From f190b8c8452a8681480d5435804a6b2a5b760252 Mon Sep 17 00:00:00 2001 From: Tim Hall Date: Tue, 14 May 2024 15:34:45 +0100 Subject: [PATCH] Add support for Ethos-U85 - Added Ethos-U85 support via the --accelerator-config CLI option Change-Id: Ia3c77dbf61c1b7fa9cb03f8f51d336de2f115a3a Signed-off-by: Tim Hall --- .clang-format | 214 + .gitignore | 2 +- .pre-commit-config.yaml | 63 +- README.md | 10 +- TESTING.md | 90 +- THIRDPARTY.md | 34 + ethosu/config_files/Arm/vela.ini | 95 +- ethosu/mlw_codec/mlw_decode.c | 4 +- ethosu/regor/CMakeLists.txt | 355 + ethosu/regor/architecture/architecture.cpp | 221 + ethosu/regor/architecture/architecture.hpp | 391 + .../ethos_u_register_cs_generator.hpp | 66 + ethosu/regor/architecture/ethos_u_scaling.cpp | 69 + ethosu/regor/architecture/ethos_u_scaling.hpp | 37 + .../regor/architecture/ethosu55/ethos_u55.cpp | 932 + .../regor/architecture/ethosu55/ethos_u55.hpp | 239 + .../ethosu55/ethos_u55_performance.cpp | 531 + .../ethosu55/ethos_u55_performance.hpp | 60 + .../ethos_u55_register_cs_generator.cpp | 1469 + .../ethos_u55_register_cs_generator.hpp | 269 + .../ethosu55/ethos_u55_scaling.cpp | 325 + .../ethosu55/ethos_u55_scaling.hpp | 39 + .../ethosu55/ethos_u55_weight_encoder.cpp | 487 + .../ethosu55/ethos_u55_weight_encoder.hpp | 86 + .../regor/architecture/ethosu65/ethos_u65.cpp | 91 + .../regor/architecture/ethosu65/ethos_u65.hpp | 45 + .../ethosu65/ethos_u65_interface.hpp | 21699 ++++++++++++++ .../ethos_u65_register_cs_generator.cpp | 40 + .../ethos_u65_register_cs_generator.hpp | 41 + .../regor/architecture/ethosu85/ethos_u85.cpp | 1324 + .../regor/architecture/ethosu85/ethos_u85.hpp | 230 + .../ethosu85/ethos_u85_interface.hpp | 24312 ++++++++++++++++ .../ethosu85/ethos_u85_performance.cpp | 545 + .../ethosu85/ethos_u85_performance.hpp | 60 + .../ethos_u85_register_cs_generator.cpp | 1828 ++ .../ethos_u85_register_cs_generator.hpp | 250 + .../ethosu85/ethos_u85_scaling.cpp | 327 + .../ethosu85/ethos_u85_scaling.hpp | 38 + 
.../ethosu85/ethos_u85_weight_encoder.cpp | 775 + .../ethosu85/ethos_u85_weight_encoder.hpp | 91 + ethosu/regor/architecture/mlw_encode.cpp | 125 + ethosu/regor/architecture/mlw_encode.hpp | 54 + .../register_command_stream_generator.hpp | 41 + ethosu/regor/architecture/weight_encoder.hpp | 210 + ethosu/regor/bindings/python/py_regor.cpp | 587 + ethosu/regor/cmake/cpack_config.cmake | 83 + ethosu/regor/cmake/pkg-config.cmake.in | 26 + ethosu/regor/cmake/regor_dependencies.cmake | 82 + ethosu/regor/cmake/regor_lib.cmake | 312 + ethosu/regor/cmake/regor_options.cmake | 279 + ethosu/regor/cmake/regor_test.cmake | 123 + ethosu/regor/cmake/regor_thirdparty.cmake | 49 + ethosu/regor/cmake/toolchains/clang.cmake | 51 + ethosu/regor/cmake/toolchains/clang32.cmake | 61 + ethosu/regor/cmake/toolchains/gcc.cmake | 37 + ethosu/regor/cmake/toolchains/gcc32.cmake | 55 + ethosu/regor/cmake/utils.cmake | 75 + ethosu/regor/common/bit_flags.hpp | 332 + ethosu/regor/common/box.hpp | 63 + ethosu/regor/common/buffer_view.hpp | 445 + ethosu/regor/common/common.cpp | 29 + ethosu/regor/common/common.hpp | 172 + ethosu/regor/common/data_type.cpp | 72 + ethosu/regor/common/data_type.hpp | 290 + ethosu/regor/common/dynamic_typing.hpp | 255 + ethosu/regor/common/ini_reader.hpp | 300 + ethosu/regor/common/lexer.hpp | 159 + ethosu/regor/common/logging.cpp | 70 + ethosu/regor/common/logging.hpp | 200 + ethosu/regor/common/numeric_util.hpp | 412 + ethosu/regor/common/ordered_map.hpp | 1112 + ethosu/regor/common/reverse_type.cpp | 30 + ethosu/regor/common/reverse_type.hpp | 28 + ethosu/regor/common/scaling.cpp | 93 + ethosu/regor/common/scaling.hpp | 49 + ethosu/regor/common/shape.hpp | 733 + ethosu/regor/common/transpose_type.cpp | 33 + ethosu/regor/common/transpose_type.hpp | 78 + ethosu/regor/common/vector_span.hpp | 77 + ethosu/regor/compiler/attributes.cpp | 66 + ethosu/regor/compiler/attributes.hpp | 259 + ethosu/regor/compiler/cascade_builder.cpp | 399 + 
ethosu/regor/compiler/cascade_builder.hpp | 99 + ethosu/regor/compiler/compiler.cpp | 455 + ethosu/regor/compiler/compiler.hpp | 133 + ethosu/regor/compiler/database.hpp | 208 + .../regor/compiler/faststorage_allocator.cpp | 403 + .../regor/compiler/faststorage_allocator.hpp | 98 + ethosu/regor/compiler/graph.hpp | 172 + ethosu/regor/compiler/graph_builder.cpp | 530 + ethosu/regor/compiler/graph_builder.hpp | 99 + ethosu/regor/compiler/graph_optimiser.cpp | 341 + ethosu/regor/compiler/graph_optimiser.hpp | 212 + ethosu/regor/compiler/graph_optimiser_db.hpp | 60 + ethosu/regor/compiler/graph_packing.cpp | 249 + ethosu/regor/compiler/graph_packing.hpp | 60 + ethosu/regor/compiler/graph_validator.cpp | 54 + ethosu/regor/compiler/graph_validator.hpp | 59 + ethosu/regor/compiler/graphir_optimiser.cpp | 187 + ethosu/regor/compiler/graphir_optimiser.hpp | 107 + .../compiler/high_level_command_stream.hpp | 273 + .../high_level_command_stream_generator.cpp | 813 + .../high_level_command_stream_generator.hpp | 63 + ethosu/regor/compiler/hillclimb_allocator.cpp | 355 + ethosu/regor/compiler/hillclimb_allocator.hpp | 172 + ethosu/regor/compiler/kernel.hpp | 161 + ethosu/regor/compiler/live_range.cpp | 233 + ethosu/regor/compiler/live_range.hpp | 112 + ethosu/regor/compiler/network_performance.cpp | 317 + ethosu/regor/compiler/network_performance.hpp | 153 + ethosu/regor/compiler/op_type.cpp | 199 + ethosu/regor/compiler/op_type.hpp | 290 + ethosu/regor/compiler/operation.cpp | 168 + ethosu/regor/compiler/operation.hpp | 234 + ethosu/regor/compiler/operation_util.hpp | 242 + ethosu/regor/compiler/optimiser_utils.cpp | 143 + ethosu/regor/compiler/optimiser_utils.hpp | 55 + ethosu/regor/compiler/quantization.cpp | 59 + ethosu/regor/compiler/quantization.hpp | 94 + ethosu/regor/compiler/raw_writer.cpp | 260 + ethosu/regor/compiler/raw_writer.hpp | 53 + ethosu/regor/compiler/scheduler.cpp | 1742 ++ ethosu/regor/compiler/scheduler.hpp | 340 + 
ethosu/regor/compiler/scheduler_decompose.cpp | 326 + ethosu/regor/compiler/scheduler_decompose.hpp | 36 + ethosu/regor/compiler/scheduler_operation.hpp | 305 + ethosu/regor/compiler/scheduler_packing.cpp | 493 + ethosu/regor/compiler/scheduler_packing.hpp | 70 + ethosu/regor/compiler/softmax.cpp | 530 + ethosu/regor/compiler/softmax.hpp | 53 + ethosu/regor/compiler/tensor.cpp | 118 + ethosu/regor/compiler/tensor.hpp | 95 + ethosu/regor/compiler/tensor_allocator.cpp | 149 + ethosu/regor/compiler/tensor_allocator.hpp | 64 + ethosu/regor/compiler/tensor_properties.hpp | 90 + .../regor/compiler/tflite_graph_optimiser.cpp | 2936 ++ .../regor/compiler/tflite_graph_optimiser.hpp | 312 + .../compiler/tflite_graph_optimiser_tp.cpp | 145 + .../regor/compiler/tosa_graph_validator.cpp | 92 + .../regor/compiler/tosa_graph_validator.hpp | 39 + .../dependencies/mlw_codec/CMakeLists.txt | 183 + .../mlw_codec/include/mlw_decode.h | 50 + .../mlw_codec/include/mlw_encode.h | 114 + .../mlw_codec/source/ml_bit_buffer.hpp | 255 + .../mlw_codec/source/ml_encoder_internal.hpp | 128 + .../mlw_codec/source/ml_ethosu_encode.cpp | 114 + .../mlw_codec/source/ml_raw_buffer.hpp | 161 + .../mlw_codec/source/mlw_decode.cpp | 361 + .../mlw_codec/source/mlw_encode.cpp | 961 + .../mlw_codec/source/mlw_encode_fwd.cpp | 197 + .../thirdparty/Catch2/BUILD.bazel | 95 + .../Catch2/CMake/Catch2Config.cmake.in | 10 + .../Catch2/CMake/CatchConfigOptions.cmake | 89 + .../Catch2/CMake/CatchMiscFunctions.cmake | 121 + .../thirdparty/Catch2/CMake/FindGcov.cmake | 157 + .../thirdparty/Catch2/CMake/FindLcov.cmake | 354 + .../thirdparty/Catch2/CMake/Findcodecov.cmake | 258 + .../Catch2/CMake/catch2-with-main.pc.in | 10 + .../thirdparty/Catch2/CMake/catch2.pc.in | 11 + .../thirdparty/Catch2/CMake/llvm-cov-wrapper | 56 + .../thirdparty/Catch2/CMakeLists.txt | 203 + .../thirdparty/Catch2/CMakePresets.json | 26 + .../thirdparty/Catch2/CODE_OF_CONDUCT.md | 46 + .../dependencies/thirdparty/Catch2/Doxyfile | 2650 ++ 
.../thirdparty/Catch2/LICENSE.txt | 23 + .../thirdparty/Catch2/MODULE.bazel | 3 + .../dependencies/thirdparty/Catch2/README.md | 103 + .../thirdparty/Catch2/SECURITY.md | 19 + .../thirdparty/Catch2/WORKSPACE.bazel | 16 + .../thirdparty/Catch2/appveyor.yml | 83 + .../thirdparty/Catch2/codecov.yml | 22 + .../thirdparty/Catch2/conanfile.py | 87 + .../thirdparty/Catch2/extras/Catch.cmake | 304 + .../Catch2/extras/CatchAddTests.cmake | 192 + .../Catch2/extras/CatchShardTests.cmake | 74 + .../Catch2/extras/CatchShardTestsImpl.cmake | 52 + .../Catch2/extras/ParseAndAddCatchTests.cmake | 252 + .../Catch2/extras/catch_amalgamated.cpp | 11516 ++++++++ .../Catch2/extras/catch_amalgamated.hpp | 13915 +++++++++ .../thirdparty/Catch2/extras/gdbinit | 16 + .../thirdparty/Catch2/extras/lldbinit | 16 + .../thirdparty/Catch2/fuzzing/CMakeLists.txt | 20 + .../thirdparty/Catch2/fuzzing/NullOStream.cpp | 18 + .../thirdparty/Catch2/fuzzing/NullOStream.h | 28 + .../Catch2/fuzzing/build_fuzzers.sh | 33 + .../Catch2/fuzzing/fuzz_TestSpecParser.cpp | 22 + .../Catch2/fuzzing/fuzz_XmlWriter.cpp | 22 + .../Catch2/fuzzing/fuzz_textflow.cpp | 53 + .../thirdparty/Catch2/mdsnippets.json | 9 + .../thirdparty/Catch2/meson.build | 19 + .../thirdparty/Catch2/meson_options.txt | 1 + .../thirdparty/Catch2/src/CMakeLists.txt | 524 + .../src/catch2/benchmark/catch_benchmark.hpp | 148 + .../catch2/benchmark/catch_benchmark_all.hpp | 46 + .../catch2/benchmark/catch_chronometer.cpp | 17 + .../catch2/benchmark/catch_chronometer.hpp | 77 + .../src/catch2/benchmark/catch_clock.hpp | 27 + .../catch2/benchmark/catch_constructor.hpp | 82 + .../catch2/benchmark/catch_environment.hpp | 29 + .../src/catch2/benchmark/catch_estimate.hpp | 25 + .../catch2/benchmark/catch_execution_plan.hpp | 58 + .../src/catch2/benchmark/catch_optimizer.hpp | 78 + .../catch_outlier_classification.hpp | 29 + .../benchmark/catch_sample_analysis.hpp | 31 + .../catch2/benchmark/detail/catch_analyse.cpp | 85 + 
.../catch2/benchmark/detail/catch_analyse.hpp | 27 + .../detail/catch_benchmark_function.cpp | 17 + .../detail/catch_benchmark_function.hpp | 107 + .../detail/catch_benchmark_stats.hpp | 48 + .../detail/catch_benchmark_stats_fwd.hpp | 23 + .../detail/catch_complete_invoke.hpp | 58 + .../benchmark/detail/catch_estimate_clock.hpp | 125 + .../catch2/benchmark/detail/catch_measure.hpp | 32 + .../catch2/benchmark/detail/catch_repeat.hpp | 36 + .../detail/catch_run_for_at_least.cpp | 31 + .../detail/catch_run_for_at_least.hpp | 65 + .../catch2/benchmark/detail/catch_stats.cpp | 392 + .../catch2/benchmark/detail/catch_stats.hpp | 60 + .../catch2/benchmark/detail/catch_timing.hpp | 31 + .../Catch2/src/catch2/catch_all.hpp | 135 + .../Catch2/src/catch2/catch_approx.cpp | 85 + .../Catch2/src/catch2/catch_approx.hpp | 128 + .../src/catch2/catch_assertion_info.hpp | 28 + .../src/catch2/catch_assertion_result.cpp | 105 + .../src/catch2/catch_assertion_result.hpp | 60 + .../Catch2/src/catch2/catch_config.cpp | 247 + .../Catch2/src/catch2/catch_config.hpp | 153 + .../src/catch2/catch_get_random_seed.cpp | 18 + .../src/catch2/catch_get_random_seed.hpp | 18 + .../Catch2/src/catch2/catch_message.cpp | 117 + .../Catch2/src/catch2/catch_message.hpp | 150 + .../Catch2/src/catch2/catch_registry_hub.cpp | 106 + .../Catch2/src/catch2/catch_section_info.hpp | 42 + .../Catch2/src/catch2/catch_session.cpp | 364 + .../Catch2/src/catch2/catch_session.hpp | 62 + .../Catch2/src/catch2/catch_tag_alias.hpp | 29 + .../catch2/catch_tag_alias_autoregistrar.cpp | 24 + .../catch2/catch_tag_alias_autoregistrar.hpp | 29 + .../src/catch2/catch_template_test_macros.hpp | 124 + .../src/catch2/catch_test_case_info.cpp | 266 + .../src/catch2/catch_test_case_info.hpp | 132 + .../Catch2/src/catch2/catch_test_macros.hpp | 226 + .../Catch2/src/catch2/catch_test_spec.cpp | 141 + .../Catch2/src/catch2/catch_test_spec.hpp | 119 + .../Catch2/src/catch2/catch_timer.cpp | 37 + .../Catch2/src/catch2/catch_timer.hpp | 27 
+ .../Catch2/src/catch2/catch_tostring.cpp | 254 + .../Catch2/src/catch2/catch_tostring.hpp | 674 + .../Catch2/src/catch2/catch_totals.cpp | 65 + .../Catch2/src/catch2/catch_totals.hpp | 41 + .../src/catch2/catch_translate_exception.cpp | 20 + .../src/catch2/catch_translate_exception.hpp | 88 + .../src/catch2/catch_user_config.hpp.in | 220 + .../Catch2/src/catch2/catch_version.cpp | 43 + .../Catch2/src/catch2/catch_version.hpp | 39 + .../src/catch2/catch_version_macros.hpp | 15 + .../generators/catch_generator_exception.cpp | 17 + .../generators/catch_generator_exception.hpp | 31 + .../catch2/generators/catch_generators.cpp | 42 + .../catch2/generators/catch_generators.hpp | 244 + .../generators/catch_generators_adapters.hpp | 241 + .../generators/catch_generators_all.hpp | 30 + .../generators/catch_generators_random.cpp | 41 + .../generators/catch_generators_random.hpp | 107 + .../generators/catch_generators_range.hpp | 111 + .../interfaces/catch_interfaces_all.hpp | 37 + .../interfaces/catch_interfaces_capture.cpp | 13 + .../interfaces/catch_interfaces_capture.hpp | 110 + .../interfaces/catch_interfaces_config.cpp | 13 + .../interfaces/catch_interfaces_config.hpp | 100 + .../catch_interfaces_enum_values_registry.hpp | 47 + .../interfaces/catch_interfaces_exception.cpp | 14 + .../interfaces/catch_interfaces_exception.hpp | 36 + .../catch_interfaces_generatortracker.cpp | 32 + .../catch_interfaces_generatortracker.hpp | 90 + .../catch_interfaces_registry_hub.cpp | 14 + .../catch_interfaces_registry_hub.hpp | 66 + .../interfaces/catch_interfaces_reporter.cpp | 93 + .../interfaces/catch_interfaces_reporter.hpp | 223 + .../catch_interfaces_reporter_factory.cpp | 14 + .../catch_interfaces_reporter_factory.hpp | 45 + .../catch_interfaces_tag_alias_registry.hpp | 29 + .../catch_interfaces_test_invoker.hpp | 21 + .../interfaces/catch_interfaces_testcase.cpp | 13 + .../interfaces/catch_interfaces_testcase.hpp | 30 + .../internal/catch_assertion_handler.cpp | 82 + 
.../internal/catch_assertion_handler.hpp | 68 + .../catch_case_insensitive_comparisons.cpp | 35 + .../catch_case_insensitive_comparisons.hpp | 30 + .../catch2/internal/catch_case_sensitive.hpp | 17 + .../src/catch2/internal/catch_clara.cpp | 464 + .../src/catch2/internal/catch_clara.hpp | 750 + .../src/catch2/internal/catch_commandline.cpp | 314 + .../src/catch2/internal/catch_commandline.hpp | 21 + .../catch2/internal/catch_compare_traits.hpp | 75 + .../internal/catch_compiler_capabilities.hpp | 447 + .../catch_config_android_logwrite.hpp | 33 + .../catch2/internal/catch_config_counter.hpp | 34 + .../internal/catch_config_prefix_messages.hpp | 29 + .../catch_config_static_analysis_support.hpp | 34 + .../catch_config_uncaught_exceptions.hpp | 46 + .../catch2/internal/catch_config_wchar.hpp | 35 + .../catch2/internal/catch_console_colour.cpp | 282 + .../catch2/internal/catch_console_colour.hpp | 141 + .../catch2/internal/catch_console_width.hpp | 19 + .../internal/catch_container_nonmembers.hpp | 73 + .../src/catch2/internal/catch_context.cpp | 41 + .../src/catch2/internal/catch_context.hpp | 51 + .../catch2/internal/catch_debug_console.cpp | 45 + .../catch2/internal/catch_debug_console.hpp | 17 + .../src/catch2/internal/catch_debugger.cpp | 120 + .../src/catch2/internal/catch_debugger.hpp | 67 + .../src/catch2/internal/catch_decomposer.cpp | 28 + .../src/catch2/internal/catch_decomposer.hpp | 452 + .../src/catch2/internal/catch_enforce.cpp | 41 + .../src/catch2/internal/catch_enforce.hpp | 54 + .../internal/catch_enum_values_registry.cpp | 73 + .../internal/catch_enum_values_registry.hpp | 36 + .../src/catch2/internal/catch_errno_guard.cpp | 16 + .../src/catch2/internal/catch_errno_guard.hpp | 27 + .../catch_exception_translator_registry.cpp | 87 + .../catch_exception_translator_registry.hpp | 30 + .../catch_fatal_condition_handler.cpp | 244 + .../catch_fatal_condition_handler.hpp | 66 + .../internal/catch_floating_point_helpers.cpp | 43 + 
.../internal/catch_floating_point_helpers.hpp | 108 + .../src/catch2/internal/catch_getenv.cpp | 37 + .../src/catch2/internal/catch_getenv.hpp | 20 + .../catch2/internal/catch_is_permutation.hpp | 138 + .../src/catch2/internal/catch_istream.cpp | 154 + .../src/catch2/internal/catch_istream.hpp | 54 + .../src/catch2/internal/catch_jsonwriter.cpp | 148 + .../src/catch2/internal/catch_jsonwriter.hpp | 120 + .../src/catch2/internal/catch_lazy_expr.cpp | 29 + .../src/catch2/internal/catch_lazy_expr.hpp | 40 + .../catch2/internal/catch_leak_detector.cpp | 38 + .../catch2/internal/catch_leak_detector.hpp | 19 + .../Catch2/src/catch2/internal/catch_list.cpp | 120 + .../Catch2/src/catch2/internal/catch_list.hpp | 43 + .../catch2/internal/catch_logical_traits.hpp | 44 + .../Catch2/src/catch2/internal/catch_main.cpp | 39 + .../catch2/internal/catch_message_info.cpp | 25 + .../catch2/internal/catch_message_info.hpp | 42 + .../Catch2/src/catch2/internal/catch_meta.hpp | 47 + .../internal/catch_move_and_forward.hpp | 19 + .../src/catch2/internal/catch_noncopyable.hpp | 28 + .../src/catch2/internal/catch_optional.hpp | 117 + .../catch2/internal/catch_output_redirect.cpp | 146 + .../catch2/internal/catch_output_redirect.hpp | 118 + .../catch2/internal/catch_parse_numbers.cpp | 52 + .../catch2/internal/catch_parse_numbers.hpp | 26 + .../src/catch2/internal/catch_platform.hpp | 37 + .../src/catch2/internal/catch_polyfills.cpp | 42 + .../src/catch2/internal/catch_polyfills.hpp | 21 + .../catch2/internal/catch_preprocessor.hpp | 237 + .../catch_preprocessor_internal_stringify.hpp | 19 + .../catch_preprocessor_remove_parens.hpp | 19 + .../catch_random_floating_point_helpers.hpp | 94 + .../internal/catch_random_integer_helpers.hpp | 201 + .../catch_random_number_generator.cpp | 70 + .../catch_random_number_generator.hpp | 59 + .../internal/catch_random_seed_generation.cpp | 35 + .../internal/catch_random_seed_generation.hpp | 26 + .../internal/catch_reporter_registry.cpp | 91 + 
.../internal/catch_reporter_registry.hpp | 55 + .../internal/catch_reporter_spec_parser.cpp | 173 + .../internal/catch_reporter_spec_parser.hpp | 85 + .../src/catch2/internal/catch_result_type.cpp | 26 + .../src/catch2/internal/catch_result_type.hpp | 57 + .../internal/catch_reusable_string_stream.cpp | 62 + .../internal/catch_reusable_string_stream.hpp | 57 + .../src/catch2/internal/catch_run_context.cpp | 700 + .../src/catch2/internal/catch_run_context.hpp | 161 + .../src/catch2/internal/catch_section.cpp | 60 + .../src/catch2/internal/catch_section.hpp | 104 + .../src/catch2/internal/catch_sharding.hpp | 41 + .../src/catch2/internal/catch_singletons.cpp | 36 + .../src/catch2/internal/catch_singletons.hpp | 45 + .../internal/catch_source_line_info.cpp | 33 + .../internal/catch_source_line_info.hpp | 37 + .../catch_startup_exception_registry.cpp | 29 + .../catch_startup_exception_registry.hpp | 29 + .../src/catch2/internal/catch_stdstreams.cpp | 24 + .../src/catch2/internal/catch_stdstreams.hpp | 22 + .../catch2/internal/catch_stream_end_stop.hpp | 30 + .../catch2/internal/catch_string_manip.cpp | 116 + .../catch2/internal/catch_string_manip.hpp | 61 + .../src/catch2/internal/catch_stringref.cpp | 66 + .../src/catch2/internal/catch_stringref.hpp | 123 + .../internal/catch_tag_alias_registry.cpp | 54 + .../internal/catch_tag_alias_registry.hpp | 33 + .../internal/catch_template_test_registry.hpp | 337 + .../internal/catch_test_case_info_hasher.cpp | 39 + .../internal/catch_test_case_info_hasher.hpp | 29 + .../catch_test_case_registry_impl.cpp | 151 + .../catch_test_case_registry_impl.hpp | 57 + .../internal/catch_test_case_tracker.cpp | 239 + .../internal/catch_test_case_tracker.hpp | 244 + .../internal/catch_test_failure_exception.cpp | 31 + .../internal/catch_test_failure_exception.hpp | 34 + .../catch2/internal/catch_test_macro_impl.hpp | 157 + .../catch2/internal/catch_test_registry.cpp | 82 + .../catch2/internal/catch_test_registry.hpp | 173 + 
.../catch2/internal/catch_test_run_info.hpp | 22 + .../internal/catch_test_spec_parser.cpp | 239 + .../internal/catch_test_spec_parser.hpp | 81 + .../src/catch2/internal/catch_textflow.cpp | 268 + .../src/catch2/internal/catch_textflow.hpp | 189 + .../src/catch2/internal/catch_to_string.hpp | 29 + .../internal/catch_uncaught_exceptions.cpp | 25 + .../internal/catch_uncaught_exceptions.hpp | 15 + ...ch_uniform_floating_point_distribution.hpp | 131 + .../catch_uniform_integer_distribution.hpp | 124 + .../src/catch2/internal/catch_unique_name.hpp | 20 + .../src/catch2/internal/catch_unique_ptr.hpp | 118 + .../src/catch2/internal/catch_void_type.hpp | 25 + .../internal/catch_wildcard_pattern.cpp | 47 + .../internal/catch_wildcard_pattern.hpp | 38 + .../catch2/internal/catch_windows_h_proxy.hpp | 28 + .../src/catch2/internal/catch_xmlwriter.cpp | 348 + .../src/catch2/internal/catch_xmlwriter.hpp | 152 + .../src/catch2/matchers/catch_matchers.cpp | 25 + .../src/catch2/matchers/catch_matchers.hpp | 237 + .../catch2/matchers/catch_matchers_all.hpp | 36 + .../catch_matchers_container_properties.cpp | 34 + .../catch_matchers_container_properties.hpp | 90 + .../matchers/catch_matchers_contains.hpp | 102 + .../matchers/catch_matchers_exception.cpp | 26 + .../matchers/catch_matchers_exception.hpp | 61 + .../catch_matchers_floating_point.cpp | 226 + .../catch_matchers_floating_point.hpp | 94 + .../matchers/catch_matchers_predicate.cpp | 17 + .../matchers/catch_matchers_predicate.hpp | 59 + .../matchers/catch_matchers_quantifiers.cpp | 24 + .../matchers/catch_matchers_quantifiers.hpp | 165 + .../matchers/catch_matchers_range_equals.hpp | 144 + .../catch2/matchers/catch_matchers_string.cpp | 114 + .../catch2/matchers/catch_matchers_string.hpp | 85 + .../matchers/catch_matchers_templated.cpp | 41 + .../matchers/catch_matchers_templated.hpp | 296 + .../catch2/matchers/catch_matchers_vector.hpp | 194 + .../matchers/internal/catch_matchers_impl.cpp | 25 + 
.../matchers/internal/catch_matchers_impl.hpp | 88 + .../thirdparty/Catch2/src/catch2/meson.build | 392 + .../reporters/catch_reporter_automake.cpp | 37 + .../reporters/catch_reporter_automake.hpp | 38 + .../reporters/catch_reporter_common_base.cpp | 49 + .../reporters/catch_reporter_common_base.hpp | 79 + .../reporters/catch_reporter_compact.cpp | 254 + .../reporters/catch_reporter_compact.hpp | 39 + .../reporters/catch_reporter_console.cpp | 665 + .../reporters/catch_reporter_console.hpp | 67 + .../catch_reporter_cumulative_base.cpp | 158 + .../catch_reporter_cumulative_base.hpp | 151 + .../catch_reporter_event_listener.cpp | 40 + .../catch_reporter_event_listener.hpp | 60 + .../reporters/catch_reporter_helpers.cpp | 343 + .../reporters/catch_reporter_helpers.hpp | 95 + .../catch2/reporters/catch_reporter_json.cpp | 372 + .../catch2/reporters/catch_reporter_json.hpp | 95 + .../catch2/reporters/catch_reporter_junit.cpp | 309 + .../catch2/reporters/catch_reporter_junit.hpp | 56 + .../catch2/reporters/catch_reporter_multi.cpp | 197 + .../catch2/reporters/catch_reporter_multi.hpp | 72 + .../reporters/catch_reporter_registrars.cpp | 36 + .../reporters/catch_reporter_registrars.hpp | 131 + .../reporters/catch_reporter_sonarqube.cpp | 162 + .../reporters/catch_reporter_sonarqube.hpp | 59 + .../catch_reporter_streaming_base.cpp | 23 + .../catch_reporter_streaming_base.hpp | 73 + .../catch2/reporters/catch_reporter_tap.cpp | 228 + .../catch2/reporters/catch_reporter_tap.hpp | 42 + .../reporters/catch_reporter_teamcity.cpp | 177 + .../reporters/catch_reporter_teamcity.hpp | 66 + .../catch2/reporters/catch_reporter_xml.cpp | 333 + .../catch2/reporters/catch_reporter_xml.hpp | 66 + .../catch2/reporters/catch_reporters_all.hpp | 41 + .../thirdparty/Catch2/third_party/clara.hpp | 1267 + .../Catch2/tools/misc/CMakeLists.txt | 11 + .../Catch2/tools/misc/SelfTest.vcxproj.user | 23 + .../misc/appveyorBuildConfigurationScript.bat | 21 + .../tools/misc/appveyorMergeCoverageScript.py 
| 9 + .../tools/misc/appveyorTestRunScript.bat | 17 + .../Catch2/tools/misc/coverage-helper.cpp | 142 + .../tools/misc/installOpenCppCoverage.ps1 | 19 + .../Catch2/tools/scripts/approvalTests.py | 243 + .../Catch2/tools/scripts/approve.py | 31 + .../Catch2/tools/scripts/buildAndTest.cmd | 17 + .../Catch2/tools/scripts/buildAndTest.sh | 19 + .../tools/scripts/checkConvenienceHeaders.py | 151 + .../tools/scripts/checkDuplicateFilenames.py | 14 + .../Catch2/tools/scripts/checkLicense.py | 46 + .../Catch2/tools/scripts/developBuild.py | 9 + .../extractFeaturesFromReleaseNotes.py | 92 + .../Catch2/tools/scripts/fixWhitespace.py | 51 + .../tools/scripts/generateAmalgamatedFiles.py | 139 + .../Catch2/tools/scripts/majorRelease.py | 9 + .../Catch2/tools/scripts/minorRelease.py | 9 + .../Catch2/tools/scripts/patchRelease.py | 9 + .../Catch2/tools/scripts/releaseCommon.py | 143 + .../Catch2/tools/scripts/scriptCommon.py | 4 + .../tools/scripts/updateDocumentSnippets.py | 23 + .../Catch2/tools/scripts/updateDocumentToC.py | 447 + .../thirdparty/download_thirdparty.sh | 118 + .../thirdparty/flatbuffers/BUILD.bazel | 139 + .../thirdparty/flatbuffers/CHANGELOG.md | 155 + .../thirdparty/flatbuffers/CMakeLists.txt | 712 + .../thirdparty/flatbuffers/CONTRIBUTING.md | 42 + .../flatbuffers/FlatBuffers.podspec | 21 + .../thirdparty/flatbuffers/Formatters.md | 22 + .../thirdparty/flatbuffers/LICENSE | 202 + .../thirdparty/flatbuffers/Package.swift | 37 + .../flatbuffers/Package@swift-5.5.swift | 37 + .../thirdparty/flatbuffers/README.md | 121 + .../thirdparty/flatbuffers/SECURITY.md | 11 + .../thirdparty/flatbuffers/WORKSPACE | 156 + .../thirdparty/flatbuffers/build_defs.bzl | 280 + .../thirdparty/flatbuffers/composer.json | 18 + .../thirdparty/flatbuffers/conanfile.py | 75 + .../include/flatbuffers/allocator.h | 68 + .../flatbuffers/include/flatbuffers/array.h | 256 + .../flatbuffers/include/flatbuffers/base.h | 495 + .../flatbuffers/include/flatbuffers/buffer.h | 199 + 
.../include/flatbuffers/buffer_ref.h | 53 + .../include/flatbuffers/code_generator.h | 97 + .../include/flatbuffers/code_generators.h | 238 + .../include/flatbuffers/default_allocator.h | 64 + .../include/flatbuffers/detached_buffer.h | 114 + .../include/flatbuffers/file_manager.h | 48 + .../include/flatbuffers/flatbuffer_builder.h | 1465 + .../include/flatbuffers/flatbuffers.h | 284 + .../flatbuffers/include/flatbuffers/flatc.h | 131 + .../include/flatbuffers/flex_flat_util.h | 36 + .../include/flatbuffers/flexbuffers.h | 1887 ++ .../flatbuffers/include/flatbuffers/grpc.h | 299 + .../flatbuffers/include/flatbuffers/hash.h | 127 + .../flatbuffers/include/flatbuffers/idl.h | 1254 + .../include/flatbuffers/minireflect.h | 420 + .../include/flatbuffers/pch/flatc_pch.h | 39 + .../flatbuffers/include/flatbuffers/pch/pch.h | 38 + .../include/flatbuffers/reflection.h | 523 + .../flatbuffers/reflection_generated.h | 1487 + .../include/flatbuffers/registry.h | 130 + .../include/flatbuffers/stl_emulation.h | 513 + .../flatbuffers/include/flatbuffers/string.h | 64 + .../flatbuffers/include/flatbuffers/struct.h | 53 + .../flatbuffers/include/flatbuffers/table.h | 188 + .../flatbuffers/include/flatbuffers/util.h | 732 + .../flatbuffers/include/flatbuffers/vector.h | 397 + .../include/flatbuffers/vector_downward.h | 289 + .../include/flatbuffers/verifier.h | 332 + .../thirdparty/flatbuffers/package.json | 46 + .../thirdparty/flatbuffers/pnpm-lock.yaml | 1184 + .../thirdparty/flatbuffers/swift.swiftformat | 27 + .../thirdparty/flatbuffers/tsconfig.json | 15 + .../thirdparty/flatbuffers/tsconfig.mjs.json | 15 + .../thirdparty/flatbuffers/typescript.bzl | 90 + .../thirdparty/fmt/CMakeLists.txt | 453 + .../thirdparty/fmt/CONTRIBUTING.md | 20 + .../dependencies/thirdparty/fmt/ChangeLog.md | 5533 ++++ .../regor/dependencies/thirdparty/fmt/LICENSE | 27 + .../dependencies/thirdparty/fmt/README.md | 490 + .../thirdparty/fmt/include/fmt/args.h | 235 + 
.../thirdparty/fmt/include/fmt/chrono.h | 2240 ++ .../thirdparty/fmt/include/fmt/color.h | 643 + .../thirdparty/fmt/include/fmt/compile.h | 535 + .../thirdparty/fmt/include/fmt/core.h | 2969 ++ .../thirdparty/fmt/include/fmt/format-inl.h | 1678 ++ .../thirdparty/fmt/include/fmt/format.h | 4535 +++ .../thirdparty/fmt/include/fmt/os.h | 455 + .../thirdparty/fmt/include/fmt/ostream.h | 245 + .../thirdparty/fmt/include/fmt/printf.h | 675 + .../thirdparty/fmt/include/fmt/ranges.h | 738 + .../thirdparty/fmt/include/fmt/std.h | 537 + .../thirdparty/fmt/include/fmt/xchar.h | 259 + .../dependencies/thirdparty/fmt/src/fmt.cc | 108 + .../dependencies/thirdparty/fmt/src/format.cc | 43 + .../dependencies/thirdparty/fmt/src/os.cc | 402 + .../dependencies/thirdparty/gemmlowp/AUTHORS | 14 + .../dependencies/thirdparty/gemmlowp/BUILD | 232 + .../thirdparty/gemmlowp/CONTRIBUTING | 53 + .../thirdparty/gemmlowp/CONTRIBUTORS | 40 + .../dependencies/thirdparty/gemmlowp/LICENSE | 202 + .../thirdparty/gemmlowp/Makefile.travis | 27 + .../thirdparty/gemmlowp/README.md | 276 + .../thirdparty/gemmlowp/WORKSPACE | 0 .../gemmlowp/fixedpoint/fixedpoint.h | 914 + .../gemmlowp/fixedpoint/fixedpoint_avx.h | 384 + .../gemmlowp/fixedpoint/fixedpoint_msa.h | 413 + .../gemmlowp/fixedpoint/fixedpoint_neon.h | 357 + .../gemmlowp/fixedpoint/fixedpoint_sse.h | 388 + .../gemmlowp/fixedpoint/fixedpoint_wasmsimd.h | 381 + .../thirdparty/gemmlowp/flags.bzl | 11 + .../thirdparty/gemmlowp/internal/allocator.h | 197 + .../gemmlowp/internal/block_params.h | 177 + .../thirdparty/gemmlowp/internal/common.h | 184 + .../thirdparty/gemmlowp/internal/compute.h | 118 + .../gemmlowp/internal/detect_platform.h | 171 + .../gemmlowp/internal/dispatch_gemm_shape.h | 207 + .../thirdparty/gemmlowp/internal/kernel.h | 251 + .../thirdparty/gemmlowp/internal/kernel_avx.h | 361 + .../gemmlowp/internal/kernel_default.h | 112 + .../thirdparty/gemmlowp/internal/kernel_msa.h | 579 + .../gemmlowp/internal/kernel_neon.h | 1913 ++ 
.../gemmlowp/internal/kernel_reference.h | 118 + .../thirdparty/gemmlowp/internal/kernel_sse.h | 519 + .../gemmlowp/internal/multi_thread_gemm.h | 721 + .../thirdparty/gemmlowp/internal/output.h | 579 + .../thirdparty/gemmlowp/internal/output_avx.h | 19 + .../thirdparty/gemmlowp/internal/output_msa.h | 1111 + .../gemmlowp/internal/output_neon.h | 922 + .../thirdparty/gemmlowp/internal/output_sse.h | 561 + .../thirdparty/gemmlowp/internal/pack.h | 444 + .../thirdparty/gemmlowp/internal/pack_avx.h | 282 + .../thirdparty/gemmlowp/internal/pack_msa.h | 431 + .../thirdparty/gemmlowp/internal/pack_neon.h | 384 + .../thirdparty/gemmlowp/internal/pack_sse.h | 128 + .../thirdparty/gemmlowp/internal/platform.h | 117 + .../gemmlowp/internal/simd_wrappers.h | 669 + .../internal/simd_wrappers_common_neon_sse.h | 850 + .../gemmlowp/internal/simd_wrappers_msa.h | 191 + .../gemmlowp/internal/simd_wrappers_neon.h | 551 + .../gemmlowp/internal/simd_wrappers_sse.h | 149 + .../gemmlowp/internal/single_thread_gemm.h | 157 + .../thirdparty/gemmlowp/internal/unpack.h | 280 + .../thirdparty/pybind11/CMakeLists.txt | 322 + .../dependencies/thirdparty/pybind11/LICENSE | 29 + .../thirdparty/pybind11/MANIFEST.in | 6 + .../thirdparty/pybind11/README.rst | 180 + .../thirdparty/pybind11/SECURITY.md | 13 + .../pybind11/include/pybind11/attr.h | 690 + .../pybind11/include/pybind11/buffer_info.h | 208 + .../pybind11/include/pybind11/cast.h | 1704 ++ .../pybind11/include/pybind11/chrono.h | 225 + .../pybind11/include/pybind11/common.h | 2 + .../pybind11/include/pybind11/complex.h | 74 + .../pybind11/include/pybind11/detail/class.h | 743 + .../pybind11/include/pybind11/detail/common.h | 1255 + .../pybind11/include/pybind11/detail/descr.h | 171 + .../pybind11/include/pybind11/detail/init.h | 434 + .../include/pybind11/detail/internals.h | 656 + .../pybind11/detail/type_caster_base.h | 1177 + .../pybind11/include/pybind11/detail/typeid.h | 65 + .../pybind11/include/pybind11/eigen.h | 12 + 
.../pybind11/include/pybind11/eigen/common.h | 9 + .../pybind11/include/pybind11/eigen/matrix.h | 714 + .../pybind11/include/pybind11/eigen/tensor.h | 516 + .../pybind11/include/pybind11/embed.h | 316 + .../pybind11/include/pybind11/eval.h | 156 + .../pybind11/include/pybind11/functional.h | 137 + .../pybind11/include/pybind11/gil.h | 239 + .../pybind11/include/pybind11/iostream.h | 265 + .../pybind11/include/pybind11/numpy.h | 1998 ++ .../pybind11/include/pybind11/operators.h | 202 + .../pybind11/include/pybind11/options.h | 92 + .../pybind11/include/pybind11/pybind11.h | 2890 ++ .../pybind11/include/pybind11/pytypes.h | 2557 ++ .../pybind11/include/pybind11/stl.h | 447 + .../include/pybind11/stl/filesystem.h | 116 + .../pybind11/include/pybind11/stl_bind.h | 851 + .../pybind11/type_caster_pyobject_ptr.h | 61 + .../thirdparty/pybind11/noxfile.py | 107 + .../thirdparty/pybind11/pybind11/__init__.py | 17 + .../thirdparty/pybind11/pybind11/__main__.py | 62 + .../thirdparty/pybind11/pybind11/_version.py | 12 + .../thirdparty/pybind11/pybind11/commands.py | 37 + .../thirdparty/pybind11/pybind11/py.typed | 0 .../pybind11/pybind11/setup_helpers.py | 498 + .../thirdparty/pybind11/pyproject.toml | 98 + .../thirdparty/pybind11/setup.cfg | 43 + .../dependencies/thirdparty/pybind11/setup.py | 150 + .../thirdparty/pybind11/tools/FindCatch.cmake | 76 + .../pybind11/tools/FindEigen3.cmake | 86 + .../pybind11/tools/FindPythonLibsNew.cmake | 287 + .../thirdparty/pybind11/tools/JoinPaths.cmake | 23 + .../thirdparty/pybind11/tools/check-style.sh | 44 + .../pybind11/tools/cmake_uninstall.cmake.in | 23 + .../codespell_ignore_lines_from_errors.py | 39 + .../thirdparty/pybind11/tools/libsize.py | 36 + .../pybind11/tools/make_changelog.py | 62 + .../thirdparty/pybind11/tools/pybind11.pc.in | 7 + .../pybind11/tools/pybind11Common.cmake | 405 + .../pybind11/tools/pybind11Config.cmake.in | 233 + .../pybind11/tools/pybind11NewTools.cmake | 256 + .../pybind11/tools/pybind11Tools.cmake | 233 + 
.../thirdparty/pybind11/tools/pyproject.toml | 3 + .../pybind11/tools/setup_global.py.in | 63 + .../pybind11/tools/setup_main.py.in | 44 + ethosu/regor/include/graphapi.hpp | 295 + ethosu/regor/include/graphapi_attr.hpp | 110 + ethosu/regor/include/graphapi_tosa_types.hpp | 127 + ethosu/regor/include/regor.h | 214 + ethosu/regor/include/regor_database.hpp | 57 + ethosu/regor/include/regor_interface.hpp | 45 + ethosu/regor/regor.cpp | 591 + ethosu/regor/test/CMakeLists.txt | 45 + ethosu/regor/test/randomize.hpp | 156 + ethosu/regor/test/test_arch_ethos_u85.cpp | 101 + ethosu/regor/test/test_data_type.cpp | 145 + .../test/test_ethos_u85_weight_encoder.cpp | 123 + ethosu/regor/test/test_graph_packing.cpp | 292 + ethosu/regor/test/test_ini_reader.cpp | 184 + ethosu/regor/test/test_main.cpp | 77 + ethosu/regor/test/test_mlw_encode.cpp | 260 + ethosu/regor/test/test_ordered_map.cpp | 1255 + ethosu/regor/test/test_raw_writer.cpp | 206 + ethosu/regor/test/test_shape.cpp | 412 + ethosu/regor/test/test_tosa_validator.cpp | 89 + ethosu/regor/test/test_transpose_type.cpp | 56 + .../regor/tflite/custom_operator_ethosu.hpp | 212 + ethosu/regor/tflite/flatbuffer_utils.hpp | 49 + ethosu/regor/tflite/tflite_mapping.cpp | 717 + ethosu/regor/tflite/tflite_mapping.hpp | 135 + .../regor/tflite/tflite_model_semantics.cpp | 976 + .../regor/tflite/tflite_model_semantics.hpp | 37 + ethosu/regor/tflite/tflite_reader.cpp | 777 + ethosu/regor/tflite/tflite_reader.hpp | 54 + .../regor/tflite/tflite_schema_generated.hpp | 17552 +++++++++++ ethosu/regor/tflite/tflite_writer.cpp | 778 + ethosu/regor/tflite/tflite_writer.hpp | 134 + ethosu/regor/tools/cppcheck.py | 73 + ethosu/regor/tosa/tosaValidationGenerator.rb | 830 + ethosu/regor/tosa/tosa_argument_checks.cpp | 335 + ethosu/regor/tosa/tosa_argument_checks.hpp | 60 + ethosu/regor/tosa/tosa_error_checks.cpp | 1486 + ethosu/regor/tosa/tosa_error_checks.hpp | 161 + ethosu/regor/tosa/tosa_level_checks.cpp | 269 + 
ethosu/regor/tosa/tosa_level_checks.hpp | 60 + ethosu/regor/tosa/tosa_mapping.cpp | 171 + ethosu/regor/tosa/tosa_mapping.hpp | 41 + ethosu/regor/tosa/tosa_reader.cpp | 879 + ethosu/regor/tosa/tosa_reader.hpp | 40 + ethosu/regor/tosa/tosa_require_checks.cpp | 104 + ethosu/regor/tosa/tosa_require_checks.hpp | 42 + ethosu/regor/tosa/tosa_schema_generated.hpp | 3099 ++ ethosu/regor/tosa/tosa_validator.cpp | 42 + ethosu/regor/tosa/tosa_validator.hpp | 62 + ...sa_validator_version_0_60_0_profile_bi.cpp | 2898 ++ ethosu/vela/__init__.py | 3 +- ethosu/vela/__main__.py | 3 +- ethosu/vela/_version.py | 2 +- ethosu/vela/architecture_features.py | 137 +- ethosu/vela/rawdata_writer.py | 26 +- ethosu/vela/stats_writer.py | 554 +- .../test/test_tflite_supported_operators.py | 4 +- ethosu/vela/tflite_graph_optimiser.py | 2 +- ethosu/vela/tflite_reader.py | 4 +- ethosu/vela/tflite_writer.py | 4 +- ethosu/vela/vela.py | 664 +- pyproject.toml | 28 +- setup.cfg | 6 +- setup.py | 198 +- test/network.py | 79 + test/test_ethos_u_vela.py | 132 + 754 files changed, 263735 insertions(+), 412 deletions(-) create mode 100644 .clang-format create mode 100644 THIRDPARTY.md create mode 100644 ethosu/regor/CMakeLists.txt create mode 100644 ethosu/regor/architecture/architecture.cpp create mode 100644 ethosu/regor/architecture/architecture.hpp create mode 100644 ethosu/regor/architecture/ethos_u_register_cs_generator.hpp create mode 100644 ethosu/regor/architecture/ethos_u_scaling.cpp create mode 100644 ethosu/regor/architecture/ethos_u_scaling.hpp create mode 100644 ethosu/regor/architecture/ethosu55/ethos_u55.cpp create mode 100644 ethosu/regor/architecture/ethosu55/ethos_u55.hpp create mode 100644 ethosu/regor/architecture/ethosu55/ethos_u55_performance.cpp create mode 100644 ethosu/regor/architecture/ethosu55/ethos_u55_performance.hpp create mode 100644 ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.cpp create mode 100644 
ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.hpp create mode 100644 ethosu/regor/architecture/ethosu55/ethos_u55_scaling.cpp create mode 100644 ethosu/regor/architecture/ethosu55/ethos_u55_scaling.hpp create mode 100644 ethosu/regor/architecture/ethosu55/ethos_u55_weight_encoder.cpp create mode 100644 ethosu/regor/architecture/ethosu55/ethos_u55_weight_encoder.hpp create mode 100644 ethosu/regor/architecture/ethosu65/ethos_u65.cpp create mode 100644 ethosu/regor/architecture/ethosu65/ethos_u65.hpp create mode 100644 ethosu/regor/architecture/ethosu65/ethos_u65_interface.hpp create mode 100644 ethosu/regor/architecture/ethosu65/ethos_u65_register_cs_generator.cpp create mode 100644 ethosu/regor/architecture/ethosu65/ethos_u65_register_cs_generator.hpp create mode 100644 ethosu/regor/architecture/ethosu85/ethos_u85.cpp create mode 100644 ethosu/regor/architecture/ethosu85/ethos_u85.hpp create mode 100644 ethosu/regor/architecture/ethosu85/ethos_u85_interface.hpp create mode 100644 ethosu/regor/architecture/ethosu85/ethos_u85_performance.cpp create mode 100644 ethosu/regor/architecture/ethosu85/ethos_u85_performance.hpp create mode 100644 ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.cpp create mode 100644 ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.hpp create mode 100644 ethosu/regor/architecture/ethosu85/ethos_u85_scaling.cpp create mode 100644 ethosu/regor/architecture/ethosu85/ethos_u85_scaling.hpp create mode 100644 ethosu/regor/architecture/ethosu85/ethos_u85_weight_encoder.cpp create mode 100644 ethosu/regor/architecture/ethosu85/ethos_u85_weight_encoder.hpp create mode 100644 ethosu/regor/architecture/mlw_encode.cpp create mode 100644 ethosu/regor/architecture/mlw_encode.hpp create mode 100644 ethosu/regor/architecture/register_command_stream_generator.hpp create mode 100644 ethosu/regor/architecture/weight_encoder.hpp create mode 100644 ethosu/regor/bindings/python/py_regor.cpp create mode 
100644 ethosu/regor/cmake/cpack_config.cmake create mode 100644 ethosu/regor/cmake/pkg-config.cmake.in create mode 100644 ethosu/regor/cmake/regor_dependencies.cmake create mode 100644 ethosu/regor/cmake/regor_lib.cmake create mode 100644 ethosu/regor/cmake/regor_options.cmake create mode 100644 ethosu/regor/cmake/regor_test.cmake create mode 100644 ethosu/regor/cmake/regor_thirdparty.cmake create mode 100644 ethosu/regor/cmake/toolchains/clang.cmake create mode 100644 ethosu/regor/cmake/toolchains/clang32.cmake create mode 100644 ethosu/regor/cmake/toolchains/gcc.cmake create mode 100644 ethosu/regor/cmake/toolchains/gcc32.cmake create mode 100644 ethosu/regor/cmake/utils.cmake create mode 100644 ethosu/regor/common/bit_flags.hpp create mode 100644 ethosu/regor/common/box.hpp create mode 100644 ethosu/regor/common/buffer_view.hpp create mode 100644 ethosu/regor/common/common.cpp create mode 100644 ethosu/regor/common/common.hpp create mode 100644 ethosu/regor/common/data_type.cpp create mode 100644 ethosu/regor/common/data_type.hpp create mode 100644 ethosu/regor/common/dynamic_typing.hpp create mode 100644 ethosu/regor/common/ini_reader.hpp create mode 100644 ethosu/regor/common/lexer.hpp create mode 100644 ethosu/regor/common/logging.cpp create mode 100644 ethosu/regor/common/logging.hpp create mode 100644 ethosu/regor/common/numeric_util.hpp create mode 100644 ethosu/regor/common/ordered_map.hpp create mode 100644 ethosu/regor/common/reverse_type.cpp create mode 100644 ethosu/regor/common/reverse_type.hpp create mode 100644 ethosu/regor/common/scaling.cpp create mode 100644 ethosu/regor/common/scaling.hpp create mode 100644 ethosu/regor/common/shape.hpp create mode 100644 ethosu/regor/common/transpose_type.cpp create mode 100644 ethosu/regor/common/transpose_type.hpp create mode 100644 ethosu/regor/common/vector_span.hpp create mode 100644 ethosu/regor/compiler/attributes.cpp create mode 100644 ethosu/regor/compiler/attributes.hpp create mode 100644 
ethosu/regor/compiler/cascade_builder.cpp create mode 100644 ethosu/regor/compiler/cascade_builder.hpp create mode 100644 ethosu/regor/compiler/compiler.cpp create mode 100644 ethosu/regor/compiler/compiler.hpp create mode 100644 ethosu/regor/compiler/database.hpp create mode 100644 ethosu/regor/compiler/faststorage_allocator.cpp create mode 100644 ethosu/regor/compiler/faststorage_allocator.hpp create mode 100644 ethosu/regor/compiler/graph.hpp create mode 100644 ethosu/regor/compiler/graph_builder.cpp create mode 100644 ethosu/regor/compiler/graph_builder.hpp create mode 100644 ethosu/regor/compiler/graph_optimiser.cpp create mode 100644 ethosu/regor/compiler/graph_optimiser.hpp create mode 100644 ethosu/regor/compiler/graph_optimiser_db.hpp create mode 100644 ethosu/regor/compiler/graph_packing.cpp create mode 100644 ethosu/regor/compiler/graph_packing.hpp create mode 100644 ethosu/regor/compiler/graph_validator.cpp create mode 100644 ethosu/regor/compiler/graph_validator.hpp create mode 100644 ethosu/regor/compiler/graphir_optimiser.cpp create mode 100644 ethosu/regor/compiler/graphir_optimiser.hpp create mode 100644 ethosu/regor/compiler/high_level_command_stream.hpp create mode 100644 ethosu/regor/compiler/high_level_command_stream_generator.cpp create mode 100644 ethosu/regor/compiler/high_level_command_stream_generator.hpp create mode 100644 ethosu/regor/compiler/hillclimb_allocator.cpp create mode 100644 ethosu/regor/compiler/hillclimb_allocator.hpp create mode 100644 ethosu/regor/compiler/kernel.hpp create mode 100644 ethosu/regor/compiler/live_range.cpp create mode 100644 ethosu/regor/compiler/live_range.hpp create mode 100644 ethosu/regor/compiler/network_performance.cpp create mode 100644 ethosu/regor/compiler/network_performance.hpp create mode 100644 ethosu/regor/compiler/op_type.cpp create mode 100644 ethosu/regor/compiler/op_type.hpp create mode 100644 ethosu/regor/compiler/operation.cpp create mode 100644 ethosu/regor/compiler/operation.hpp create 
mode 100644 ethosu/regor/compiler/operation_util.hpp create mode 100644 ethosu/regor/compiler/optimiser_utils.cpp create mode 100644 ethosu/regor/compiler/optimiser_utils.hpp create mode 100644 ethosu/regor/compiler/quantization.cpp create mode 100644 ethosu/regor/compiler/quantization.hpp create mode 100644 ethosu/regor/compiler/raw_writer.cpp create mode 100644 ethosu/regor/compiler/raw_writer.hpp create mode 100644 ethosu/regor/compiler/scheduler.cpp create mode 100644 ethosu/regor/compiler/scheduler.hpp create mode 100644 ethosu/regor/compiler/scheduler_decompose.cpp create mode 100644 ethosu/regor/compiler/scheduler_decompose.hpp create mode 100644 ethosu/regor/compiler/scheduler_operation.hpp create mode 100644 ethosu/regor/compiler/scheduler_packing.cpp create mode 100644 ethosu/regor/compiler/scheduler_packing.hpp create mode 100644 ethosu/regor/compiler/softmax.cpp create mode 100644 ethosu/regor/compiler/softmax.hpp create mode 100644 ethosu/regor/compiler/tensor.cpp create mode 100644 ethosu/regor/compiler/tensor.hpp create mode 100644 ethosu/regor/compiler/tensor_allocator.cpp create mode 100644 ethosu/regor/compiler/tensor_allocator.hpp create mode 100644 ethosu/regor/compiler/tensor_properties.hpp create mode 100644 ethosu/regor/compiler/tflite_graph_optimiser.cpp create mode 100644 ethosu/regor/compiler/tflite_graph_optimiser.hpp create mode 100644 ethosu/regor/compiler/tflite_graph_optimiser_tp.cpp create mode 100644 ethosu/regor/compiler/tosa_graph_validator.cpp create mode 100644 ethosu/regor/compiler/tosa_graph_validator.hpp create mode 100644 ethosu/regor/dependencies/mlw_codec/CMakeLists.txt create mode 100644 ethosu/regor/dependencies/mlw_codec/include/mlw_decode.h create mode 100644 ethosu/regor/dependencies/mlw_codec/include/mlw_encode.h create mode 100644 ethosu/regor/dependencies/mlw_codec/source/ml_bit_buffer.hpp create mode 100644 ethosu/regor/dependencies/mlw_codec/source/ml_encoder_internal.hpp create mode 100644 
ethosu/regor/dependencies/mlw_codec/source/ml_ethosu_encode.cpp create mode 100644 ethosu/regor/dependencies/mlw_codec/source/ml_raw_buffer.hpp create mode 100644 ethosu/regor/dependencies/mlw_codec/source/mlw_decode.cpp create mode 100644 ethosu/regor/dependencies/mlw_codec/source/mlw_encode.cpp create mode 100644 ethosu/regor/dependencies/mlw_codec/source/mlw_encode_fwd.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/BUILD.bazel create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/CMake/Catch2Config.cmake.in create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/CMake/CatchConfigOptions.cmake create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/CMake/CatchMiscFunctions.cmake create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/CMake/FindGcov.cmake create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/CMake/FindLcov.cmake create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/CMake/Findcodecov.cmake create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/CMake/catch2-with-main.pc.in create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/CMake/catch2.pc.in create mode 100755 ethosu/regor/dependencies/thirdparty/Catch2/CMake/llvm-cov-wrapper create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/CMakeLists.txt create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/CMakePresets.json create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/CODE_OF_CONDUCT.md create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/Doxyfile create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/LICENSE.txt create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/MODULE.bazel create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/README.md create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/SECURITY.md create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/WORKSPACE.bazel create mode 100644 
ethosu/regor/dependencies/thirdparty/Catch2/appveyor.yml create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/codecov.yml create mode 100755 ethosu/regor/dependencies/thirdparty/Catch2/conanfile.py create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/extras/Catch.cmake create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/extras/CatchAddTests.cmake create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/extras/CatchShardTests.cmake create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/extras/CatchShardTestsImpl.cmake create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/extras/ParseAndAddCatchTests.cmake create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/extras/catch_amalgamated.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/extras/catch_amalgamated.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/extras/gdbinit create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/extras/lldbinit create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/fuzzing/CMakeLists.txt create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/fuzzing/NullOStream.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/fuzzing/NullOStream.h create mode 100755 ethosu/regor/dependencies/thirdparty/Catch2/fuzzing/build_fuzzers.sh create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/fuzzing/fuzz_TestSpecParser.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/fuzzing/fuzz_XmlWriter.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/fuzzing/fuzz_textflow.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/mdsnippets.json create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/meson.build create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/meson_options.txt create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/CMakeLists.txt create mode 100644 
ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/benchmark/catch_benchmark.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/benchmark/catch_benchmark_all.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/benchmark/catch_chronometer.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/benchmark/catch_chronometer.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/benchmark/catch_clock.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/benchmark/catch_constructor.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/benchmark/catch_environment.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/benchmark/catch_estimate.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/benchmark/catch_execution_plan.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/benchmark/catch_optimizer.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/benchmark/catch_outlier_classification.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/benchmark/catch_sample_analysis.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/benchmark/detail/catch_analyse.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/benchmark/detail/catch_analyse.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/benchmark/detail/catch_benchmark_function.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/benchmark/detail/catch_benchmark_function.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/benchmark/detail/catch_benchmark_stats.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/benchmark/detail/catch_benchmark_stats_fwd.hpp create mode 100644 
ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/benchmark/detail/catch_complete_invoke.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/benchmark/detail/catch_estimate_clock.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/benchmark/detail/catch_measure.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/benchmark/detail/catch_repeat.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/benchmark/detail/catch_run_for_at_least.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/benchmark/detail/catch_run_for_at_least.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/benchmark/detail/catch_stats.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/benchmark/detail/catch_stats.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/benchmark/detail/catch_timing.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_all.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_approx.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_approx.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_assertion_info.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_assertion_result.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_assertion_result.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_config.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_config.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_get_random_seed.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_get_random_seed.hpp create mode 100644 
ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_message.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_message.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_registry_hub.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_section_info.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_session.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_session.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_tag_alias.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_tag_alias_autoregistrar.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_tag_alias_autoregistrar.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_template_test_macros.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_test_case_info.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_test_case_info.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_test_macros.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_test_spec.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_test_spec.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_timer.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_timer.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_tostring.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_tostring.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_totals.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_totals.hpp create mode 
100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_translate_exception.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_translate_exception.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_user_config.hpp.in create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_version.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_version.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_version_macros.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/generators/catch_generator_exception.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/generators/catch_generator_exception.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/generators/catch_generators.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/generators/catch_generators.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/generators/catch_generators_adapters.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/generators/catch_generators_all.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/generators/catch_generators_random.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/generators/catch_generators_random.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/generators/catch_generators_range.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/interfaces/catch_interfaces_all.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/interfaces/catch_interfaces_capture.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/interfaces/catch_interfaces_capture.hpp create mode 100644 
ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/interfaces/catch_interfaces_config.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/interfaces/catch_interfaces_config.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/interfaces/catch_interfaces_enum_values_registry.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/interfaces/catch_interfaces_exception.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/interfaces/catch_interfaces_exception.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/interfaces/catch_interfaces_generatortracker.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/interfaces/catch_interfaces_generatortracker.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/interfaces/catch_interfaces_registry_hub.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/interfaces/catch_interfaces_registry_hub.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/interfaces/catch_interfaces_reporter.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/interfaces/catch_interfaces_reporter.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/interfaces/catch_interfaces_reporter_factory.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/interfaces/catch_interfaces_reporter_factory.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/interfaces/catch_interfaces_tag_alias_registry.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/interfaces/catch_interfaces_test_invoker.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/interfaces/catch_interfaces_testcase.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/interfaces/catch_interfaces_testcase.hpp 
create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_assertion_handler.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_assertion_handler.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_case_insensitive_comparisons.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_case_insensitive_comparisons.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_case_sensitive.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_clara.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_clara.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_commandline.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_commandline.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_compare_traits.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_compiler_capabilities.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_config_android_logwrite.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_config_counter.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_config_prefix_messages.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_config_static_analysis_support.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_config_uncaught_exceptions.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_config_wchar.hpp create mode 100644 
ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_console_colour.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_console_colour.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_console_width.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_container_nonmembers.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_context.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_context.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_debug_console.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_debug_console.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_debugger.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_debugger.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_decomposer.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_decomposer.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_enforce.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_enforce.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_enum_values_registry.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_enum_values_registry.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_errno_guard.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_errno_guard.hpp create mode 100644 
ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_exception_translator_registry.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_exception_translator_registry.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_fatal_condition_handler.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_fatal_condition_handler.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_floating_point_helpers.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_floating_point_helpers.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_getenv.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_getenv.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_is_permutation.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_istream.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_istream.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_jsonwriter.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_jsonwriter.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_lazy_expr.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_lazy_expr.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_leak_detector.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_leak_detector.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_list.cpp create mode 100644 
ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_list.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_logical_traits.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_main.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_message_info.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_message_info.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_meta.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_move_and_forward.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_noncopyable.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_optional.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_output_redirect.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_output_redirect.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_parse_numbers.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_parse_numbers.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_platform.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_polyfills.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_polyfills.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_preprocessor.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_preprocessor_internal_stringify.hpp create mode 100644 
ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_preprocessor_remove_parens.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_random_floating_point_helpers.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_random_integer_helpers.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_random_number_generator.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_random_number_generator.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_random_seed_generation.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_random_seed_generation.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_reporter_registry.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_reporter_registry.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_reporter_spec_parser.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_reporter_spec_parser.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_result_type.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_result_type.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_reusable_string_stream.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_reusable_string_stream.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_run_context.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_run_context.hpp create mode 100644 
ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_section.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_section.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_sharding.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_singletons.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_singletons.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_source_line_info.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_source_line_info.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_startup_exception_registry.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_startup_exception_registry.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_stdstreams.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_stdstreams.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_stream_end_stop.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_string_manip.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_string_manip.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_stringref.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_stringref.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_tag_alias_registry.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_tag_alias_registry.hpp create mode 100644 
ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_template_test_registry.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_test_case_info_hasher.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_test_case_info_hasher.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_test_case_registry_impl.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_test_case_registry_impl.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_test_case_tracker.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_test_case_tracker.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_test_failure_exception.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_test_failure_exception.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_test_macro_impl.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_test_registry.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_test_registry.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_test_run_info.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_test_spec_parser.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_test_spec_parser.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_textflow.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_textflow.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_to_string.hpp create mode 100644 
ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_uncaught_exceptions.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_uncaught_exceptions.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_uniform_floating_point_distribution.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_uniform_integer_distribution.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_unique_name.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_unique_ptr.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_void_type.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_wildcard_pattern.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_wildcard_pattern.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_windows_h_proxy.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_xmlwriter.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_xmlwriter.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/matchers/catch_matchers.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/matchers/catch_matchers.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/matchers/catch_matchers_all.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/matchers/catch_matchers_container_properties.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/matchers/catch_matchers_container_properties.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/matchers/catch_matchers_contains.hpp create mode 100644 
ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/matchers/catch_matchers_exception.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/matchers/catch_matchers_exception.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/matchers/catch_matchers_floating_point.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/matchers/catch_matchers_floating_point.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/matchers/catch_matchers_predicate.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/matchers/catch_matchers_predicate.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/matchers/catch_matchers_quantifiers.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/matchers/catch_matchers_quantifiers.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/matchers/catch_matchers_range_equals.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/matchers/catch_matchers_string.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/matchers/catch_matchers_string.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/matchers/catch_matchers_templated.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/matchers/catch_matchers_templated.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/matchers/catch_matchers_vector.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/matchers/internal/catch_matchers_impl.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/matchers/internal/catch_matchers_impl.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/meson.build create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_automake.cpp create mode 
100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_automake.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_common_base.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_common_base.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_compact.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_compact.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_console.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_console.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_cumulative_base.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_cumulative_base.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_event_listener.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_event_listener.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_helpers.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_helpers.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_json.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_json.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_junit.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_junit.hpp create mode 100644 
ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_multi.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_multi.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_registrars.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_registrars.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_sonarqube.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_sonarqube.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_streaming_base.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_streaming_base.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_tap.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_tap.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_teamcity.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_teamcity.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_xml.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_xml.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporters_all.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/third_party/clara.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/tools/misc/CMakeLists.txt create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/tools/misc/SelfTest.vcxproj.user create mode 100644 
ethosu/regor/dependencies/thirdparty/Catch2/tools/misc/appveyorBuildConfigurationScript.bat create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/tools/misc/appveyorMergeCoverageScript.py create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/tools/misc/appveyorTestRunScript.bat create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/tools/misc/coverage-helper.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/tools/misc/installOpenCppCoverage.ps1 create mode 100755 ethosu/regor/dependencies/thirdparty/Catch2/tools/scripts/approvalTests.py create mode 100755 ethosu/regor/dependencies/thirdparty/Catch2/tools/scripts/approve.py create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/tools/scripts/buildAndTest.cmd create mode 100755 ethosu/regor/dependencies/thirdparty/Catch2/tools/scripts/buildAndTest.sh create mode 100755 ethosu/regor/dependencies/thirdparty/Catch2/tools/scripts/checkConvenienceHeaders.py create mode 100755 ethosu/regor/dependencies/thirdparty/Catch2/tools/scripts/checkDuplicateFilenames.py create mode 100755 ethosu/regor/dependencies/thirdparty/Catch2/tools/scripts/checkLicense.py create mode 100755 ethosu/regor/dependencies/thirdparty/Catch2/tools/scripts/developBuild.py create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/tools/scripts/extractFeaturesFromReleaseNotes.py create mode 100755 ethosu/regor/dependencies/thirdparty/Catch2/tools/scripts/fixWhitespace.py create mode 100755 ethosu/regor/dependencies/thirdparty/Catch2/tools/scripts/generateAmalgamatedFiles.py create mode 100755 ethosu/regor/dependencies/thirdparty/Catch2/tools/scripts/majorRelease.py create mode 100755 ethosu/regor/dependencies/thirdparty/Catch2/tools/scripts/minorRelease.py create mode 100755 ethosu/regor/dependencies/thirdparty/Catch2/tools/scripts/patchRelease.py create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/tools/scripts/releaseCommon.py create mode 100644 
ethosu/regor/dependencies/thirdparty/Catch2/tools/scripts/scriptCommon.py create mode 100755 ethosu/regor/dependencies/thirdparty/Catch2/tools/scripts/updateDocumentSnippets.py create mode 100755 ethosu/regor/dependencies/thirdparty/Catch2/tools/scripts/updateDocumentToC.py create mode 100644 ethosu/regor/dependencies/thirdparty/download_thirdparty.sh create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/BUILD.bazel create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/CHANGELOG.md create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/CMakeLists.txt create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/CONTRIBUTING.md create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/FlatBuffers.podspec create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/Formatters.md create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/LICENSE create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/Package.swift create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/Package@swift-5.5.swift create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/README.md create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/SECURITY.md create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/WORKSPACE create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/build_defs.bzl create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/composer.json create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/conanfile.py create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/allocator.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/array.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/base.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/buffer.h create mode 100644 
ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/buffer_ref.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/code_generator.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/code_generators.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/default_allocator.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/detached_buffer.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/file_manager.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/flatbuffer_builder.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/flatbuffers.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/flatc.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/flex_flat_util.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/flexbuffers.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/grpc.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/hash.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/idl.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/minireflect.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/pch/flatc_pch.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/pch/pch.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/reflection.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/reflection_generated.h create mode 100644 
ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/registry.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/stl_emulation.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/string.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/struct.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/table.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/util.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/vector.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/vector_downward.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/verifier.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/package.json create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/pnpm-lock.yaml create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/swift.swiftformat create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/tsconfig.json create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/tsconfig.mjs.json create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/typescript.bzl create mode 100644 ethosu/regor/dependencies/thirdparty/fmt/CMakeLists.txt create mode 100644 ethosu/regor/dependencies/thirdparty/fmt/CONTRIBUTING.md create mode 100644 ethosu/regor/dependencies/thirdparty/fmt/ChangeLog.md create mode 100644 ethosu/regor/dependencies/thirdparty/fmt/LICENSE create mode 100644 ethosu/regor/dependencies/thirdparty/fmt/README.md create mode 100644 ethosu/regor/dependencies/thirdparty/fmt/include/fmt/args.h create mode 100644 ethosu/regor/dependencies/thirdparty/fmt/include/fmt/chrono.h create mode 100644 ethosu/regor/dependencies/thirdparty/fmt/include/fmt/color.h create mode 100644 
ethosu/regor/dependencies/thirdparty/fmt/include/fmt/compile.h create mode 100644 ethosu/regor/dependencies/thirdparty/fmt/include/fmt/core.h create mode 100644 ethosu/regor/dependencies/thirdparty/fmt/include/fmt/format-inl.h create mode 100644 ethosu/regor/dependencies/thirdparty/fmt/include/fmt/format.h create mode 100644 ethosu/regor/dependencies/thirdparty/fmt/include/fmt/os.h create mode 100644 ethosu/regor/dependencies/thirdparty/fmt/include/fmt/ostream.h create mode 100644 ethosu/regor/dependencies/thirdparty/fmt/include/fmt/printf.h create mode 100644 ethosu/regor/dependencies/thirdparty/fmt/include/fmt/ranges.h create mode 100644 ethosu/regor/dependencies/thirdparty/fmt/include/fmt/std.h create mode 100644 ethosu/regor/dependencies/thirdparty/fmt/include/fmt/xchar.h create mode 100644 ethosu/regor/dependencies/thirdparty/fmt/src/fmt.cc create mode 100644 ethosu/regor/dependencies/thirdparty/fmt/src/format.cc create mode 100644 ethosu/regor/dependencies/thirdparty/fmt/src/os.cc create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/AUTHORS create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/BUILD create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/CONTRIBUTING create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/CONTRIBUTORS create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/LICENSE create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/Makefile.travis create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/README.md create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/WORKSPACE create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/fixedpoint/fixedpoint.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/fixedpoint/fixedpoint_avx.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/fixedpoint/fixedpoint_msa.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/fixedpoint/fixedpoint_neon.h create mode 100644 
ethosu/regor/dependencies/thirdparty/gemmlowp/fixedpoint/fixedpoint_sse.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/fixedpoint/fixedpoint_wasmsimd.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/flags.bzl create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/allocator.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/block_params.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/common.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/compute.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/detect_platform.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/dispatch_gemm_shape.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/kernel.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/kernel_avx.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/kernel_default.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/kernel_msa.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/kernel_neon.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/kernel_reference.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/kernel_sse.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/multi_thread_gemm.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/output.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/output_avx.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/output_msa.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/output_neon.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/output_sse.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/pack.h 
create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/pack_avx.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/pack_msa.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/pack_neon.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/pack_sse.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/platform.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/simd_wrappers.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/simd_wrappers_common_neon_sse.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/simd_wrappers_msa.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/simd_wrappers_neon.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/simd_wrappers_sse.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/single_thread_gemm.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/unpack.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/CMakeLists.txt create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/LICENSE create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/MANIFEST.in create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/README.rst create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/SECURITY.md create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/attr.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/buffer_info.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/cast.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/chrono.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/common.h create mode 100644 
ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/complex.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/detail/class.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/detail/common.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/detail/descr.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/detail/init.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/detail/internals.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/detail/type_caster_base.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/detail/typeid.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/eigen.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/eigen/common.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/eigen/matrix.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/eigen/tensor.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/embed.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/eval.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/functional.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/gil.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/iostream.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/numpy.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/operators.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/options.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/pybind11.h create mode 100644 
ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/pytypes.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/stl.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/stl/filesystem.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/stl_bind.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/type_caster_pyobject_ptr.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/noxfile.py create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/pybind11/__init__.py create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/pybind11/__main__.py create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/pybind11/_version.py create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/pybind11/commands.py create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/pybind11/py.typed create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/pybind11/setup_helpers.py create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/pyproject.toml create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/setup.cfg create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/setup.py create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/tools/FindCatch.cmake create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/tools/FindEigen3.cmake create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/tools/FindPythonLibsNew.cmake create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/tools/JoinPaths.cmake create mode 100755 ethosu/regor/dependencies/thirdparty/pybind11/tools/check-style.sh create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/tools/cmake_uninstall.cmake.in create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/tools/codespell_ignore_lines_from_errors.py create mode 100644 
ethosu/regor/dependencies/thirdparty/pybind11/tools/libsize.py create mode 100755 ethosu/regor/dependencies/thirdparty/pybind11/tools/make_changelog.py create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/tools/pybind11.pc.in create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/tools/pybind11Common.cmake create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/tools/pybind11Config.cmake.in create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/tools/pybind11NewTools.cmake create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/tools/pybind11Tools.cmake create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/tools/pyproject.toml create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/tools/setup_global.py.in create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/tools/setup_main.py.in create mode 100644 ethosu/regor/include/graphapi.hpp create mode 100644 ethosu/regor/include/graphapi_attr.hpp create mode 100644 ethosu/regor/include/graphapi_tosa_types.hpp create mode 100644 ethosu/regor/include/regor.h create mode 100644 ethosu/regor/include/regor_database.hpp create mode 100644 ethosu/regor/include/regor_interface.hpp create mode 100644 ethosu/regor/regor.cpp create mode 100644 ethosu/regor/test/CMakeLists.txt create mode 100644 ethosu/regor/test/randomize.hpp create mode 100644 ethosu/regor/test/test_arch_ethos_u85.cpp create mode 100644 ethosu/regor/test/test_data_type.cpp create mode 100644 ethosu/regor/test/test_ethos_u85_weight_encoder.cpp create mode 100644 ethosu/regor/test/test_graph_packing.cpp create mode 100644 ethosu/regor/test/test_ini_reader.cpp create mode 100644 ethosu/regor/test/test_main.cpp create mode 100644 ethosu/regor/test/test_mlw_encode.cpp create mode 100644 ethosu/regor/test/test_ordered_map.cpp create mode 100644 ethosu/regor/test/test_raw_writer.cpp create mode 100644 ethosu/regor/test/test_shape.cpp create mode 100644 
ethosu/regor/test/test_tosa_validator.cpp create mode 100644 ethosu/regor/test/test_transpose_type.cpp create mode 100644 ethosu/regor/tflite/custom_operator_ethosu.hpp create mode 100644 ethosu/regor/tflite/flatbuffer_utils.hpp create mode 100644 ethosu/regor/tflite/tflite_mapping.cpp create mode 100644 ethosu/regor/tflite/tflite_mapping.hpp create mode 100644 ethosu/regor/tflite/tflite_model_semantics.cpp create mode 100644 ethosu/regor/tflite/tflite_model_semantics.hpp create mode 100644 ethosu/regor/tflite/tflite_reader.cpp create mode 100644 ethosu/regor/tflite/tflite_reader.hpp create mode 100644 ethosu/regor/tflite/tflite_schema_generated.hpp create mode 100644 ethosu/regor/tflite/tflite_writer.cpp create mode 100644 ethosu/regor/tflite/tflite_writer.hpp create mode 100755 ethosu/regor/tools/cppcheck.py create mode 100755 ethosu/regor/tosa/tosaValidationGenerator.rb create mode 100644 ethosu/regor/tosa/tosa_argument_checks.cpp create mode 100644 ethosu/regor/tosa/tosa_argument_checks.hpp create mode 100644 ethosu/regor/tosa/tosa_error_checks.cpp create mode 100644 ethosu/regor/tosa/tosa_error_checks.hpp create mode 100644 ethosu/regor/tosa/tosa_level_checks.cpp create mode 100644 ethosu/regor/tosa/tosa_level_checks.hpp create mode 100644 ethosu/regor/tosa/tosa_mapping.cpp create mode 100644 ethosu/regor/tosa/tosa_mapping.hpp create mode 100644 ethosu/regor/tosa/tosa_reader.cpp create mode 100644 ethosu/regor/tosa/tosa_reader.hpp create mode 100644 ethosu/regor/tosa/tosa_require_checks.cpp create mode 100644 ethosu/regor/tosa/tosa_require_checks.hpp create mode 100644 ethosu/regor/tosa/tosa_schema_generated.hpp create mode 100644 ethosu/regor/tosa/tosa_validator.cpp create mode 100644 ethosu/regor/tosa/tosa_validator.hpp create mode 100644 ethosu/regor/tosa/tosa_validator_version_0_60_0_profile_bi.cpp mode change 100644 => 100755 ethosu/vela/vela.py create mode 100644 test/network.py create mode 100644 test/test_ethos_u_vela.py diff --git a/.clang-format 
b/.clang-format new file mode 100644 index 00000000..778d3ab0 --- /dev/null +++ b/.clang-format @@ -0,0 +1,214 @@ +Language: Cpp +# BasedOnStyle: Google +AccessModifierOffset: -4 +AlignAfterOpenBracket: DontAlign +# AlignArrayOfStructures: Left +AlignConsecutiveMacros: None +AlignConsecutiveAssignments: None +AlignConsecutiveBitFields: None +AlignConsecutiveDeclarations: None +AlignEscapedNewlines: DontAlign +AlignOperands: Align +AlignTrailingComments: true +#AllowAllArgumentsOnNextLine: true +#AllowAllConstructorInitializersOnNextLine: true +#AllowAllParametersOfDeclarationOnNextLine: true +AllowShortEnumsOnASingleLine: true +AllowShortBlocksOnASingleLine: Never +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: InlineOnly +#AllowShortLambdasOnASingleLine: All +AllowShortIfStatementsOnASingleLine: AllIfsAndElse +AllowShortLoopsOnASingleLine: false +#AlwaysBreakAfterDefinitionReturnType: None +#AlwaysBreakAfterReturnType: None +#AlwaysBreakBeforeMultilineStrings: true +AlwaysBreakTemplateDeclarations: Yes +#AttributeMacros: +# - __capability +#BinPackArguments: true +#BinPackParameters: true +BraceWrapping: + AfterCaseLabel: true + AfterClass: true + AfterControlStatement: Always + AfterEnum: true + AfterFunction: true + AfterNamespace: true + AfterObjCDeclaration: true + AfterStruct: true + AfterUnion: true + AfterExternBlock: false + BeforeCatch: true + BeforeElse: true + BeforeLambdaBody: true + BeforeWhile: false + IndentBraces: false + SplitEmptyFunction: true + SplitEmptyRecord: true + SplitEmptyNamespace: true +#BreakBeforeBinaryOperators: None +#BreakBeforeConceptDeclarations: true +BreakBeforeBraces: Custom +#BreakBeforeInheritanceComma: false +BreakInheritanceList: AfterColon +BreakBeforeTernaryOperators: false +BreakConstructorInitializers: AfterColon +#BreakAfterJavaFieldAnnotations: false +BreakStringLiterals: false +ColumnLimit: 120 +#CommentPragmas: '^ IWYU pragma:' +#CompactNamespaces: false 
+ConstructorInitializerIndentWidth: 8 +ContinuationIndentWidth: 4 +#Cpp11BracedListStyle: true +#DeriveLineEnding: true +DerivePointerAlignment: false +#DisableFormat: false +#EmptyLineAfterAccessModifier: Never +#EmptyLineBeforeAccessModifier: LogicalBlock +#ExperimentalAutoDetectBinPacking: false +#FixNamespaceComments: true +#ForEachMacros: +# - foreach +# - Q_FOREACH +# - BOOST_FOREACH +#IfMacros: +# - KJ_IF_MAYBE +IncludeBlocks: Regroup +IncludeCategories: +- Regex: \"common/common.hpp\" + Priority: 1 + SortPriority: 0 + CaseSensitive: false +- Regex: \"common/logging.hpp\" + Priority: 1 + SortPriority: 1 + CaseSensitive: false +#- Regex: \"common/.*\.hpp\" +# Priority: 1 +# SortPriority: 2 +# CaseSensitive: false +- Regex: \".*\.hpp\" + Priority: 2 + SortPriority: 3 + CaseSensitive: false +- Regex: ^ + Priority: 3 + SortPriority: 4 + CaseSensitive: false +- Regex: ^<.*\.h> + Priority: 3 + SortPriority: 5 + CaseSensitive: false +- Regex: ^<.* + Priority: 3 + SortPriority: 6 + CaseSensitive: false +#IncludeIsMainRegex: '([-_](test|unittest))?$' +#IncludeIsMainSourceRegex: '' +#IndentAccessModifiers: false +IndentCaseLabels: true +#IndentCaseBlocks: false +#IndentGotoLabels: true +IndentPPDirectives: None +IndentExternBlock: AfterExternBlock +#IndentRequires: false +IndentWidth: 4 +#IndentWrappedFunctionNames: false +#InsertTrailingCommas: None +#JavaScriptQuotes: Leave +#JavaScriptWrapImports: true +#KeepEmptyLinesAtTheStartOfBlocks: false +#LambdaBodyIndentation: Signature / OuterScope +MacroBlockBegin: ^BEGIN_FIELD_TABLE|^BEGIN_ENUM_TABLE +MacroBlockEnd: ^END_FIELD_TABLE|^END_ENUM_TABLE +MaxEmptyLinesToKeep: 3 +#NamespaceIndentation: None +#ObjCBinPackProtocolList: Never +#ObjCBlockIndentWidth: 2 +#ObjCBreakBeforeNestedBlockParam: true +#ObjCSpaceAfterProperty: false +#ObjCSpaceBeforeProtocolList: true +PenaltyBreakAssignment: 350 +PenaltyBreakBeforeFirstCallParameter: 10 +PenaltyBreakComment: 10 +PenaltyBreakFirstLessLess: 10 +PenaltyBreakString: 100 
+PenaltyBreakTemplateDeclaration: 10 +PenaltyExcessCharacter: 10 +PenaltyReturnTypeOnItsOwnLine: 1000 +PenaltyIndentedWhitespace: 250 +PointerAlignment: Right +#PPIndentWidth: -1 +#RawStringFormats: +# - Language: Cpp +# Delimiters: +# - cc +# - CC +# - cpp +# - Cpp +# - CPP +# - 'c++' +# - 'C++' +# CanonicalDelimiter: '' +# BasedOnStyle: google +# - Language: TextProto +# Delimiters: +# - pb +# - PB +# - proto +# - PROTO +# EnclosingFunctions: +# - EqualsProto +# - EquivToProto +# - PARSE_PARTIAL_TEXT_PROTO +# - PARSE_TEST_PROTO +# - PARSE_TEXT_PROTO +# - ParseTextOrDie +# - ParseTextProtoOrDie +# - ParseTestProto +# - ParsePartialTestProto +# CanonicalDelimiter: pb +# BasedOnStyle: google +#ReferenceAlignment: Pointer +#ReflowComments: true +ShortNamespaceLines: 1 +SortIncludes: CaseInsensitive +#SortJavaStaticImport: Before +#SortUsingDeclarations: true +#SpaceAfterCStyleCast: false +#SpaceAfterLogicalNot: false +SpaceAfterTemplateKeyword: false +#SpaceBeforeAssignmentOperators: true +#SpaceBeforeCaseColon: false +#SpaceBeforeCpp11BracedList: false +#SpaceBeforeCtorInitializerColon: true +#SpaceBeforeInheritanceColon: true +#SpaceBeforeParens: ControlStatements +#SpaceAroundPointerQualifiers: Default +#SpaceBeforeRangeBasedForLoopColon: true +#SpaceInEmptyBlock: false +#SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 2 +#SpacesInAngles: Never +SpacesInConditionalStatement: true +#SpacesInContainerLiterals: true +#SpacesInCStyleCastParentheses: false +SpacesInLineCommentPrefix: + Minimum: 1 + Maximum: -1 +#SpacesInParentheses: false +#SpacesInSquareBrackets: false +#SpaceBeforeSquareBrackets: false +BitFieldColonSpacing: None +#Standard: Auto +StatementAttributeLikeMacros: +- DLL_EXPORT +- DLL_IMPORT +StatementMacros: [LOG_PRINT, LOG_WARN, LOG_ERROR, DECLARE_ENUM_AS_FLAGS, REGOR_FIELD_TYPE] +TabWidth: 4 +#UseCRLF: false +#UseTab: Never +WhitespaceSensitiveMacros: +- MACRO_CONCAT diff --git a/.gitignore b/.gitignore index 070ea6c3..55b4c109 100644 
--- a/.gitignore +++ b/.gitignore @@ -3,7 +3,7 @@ build/ ethos_u_vela.egg-info/ ethosu/*.so output/ -.vscode/ .coverage __pycache__ +*.pyc *.pyd diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 472cdf93..3b337df7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates # SPDX-License-Identifier: Apache-2.0 @@ -14,16 +14,50 @@ # See the License for the specific language governing permissions and # limitations under the License. -exclude: '^ethosu/vela/(tflite|ethos_u55_regs|tosa)/' +exclude: ethosu/vela/(tflite|ethos_u55_regs|tosa)|ethosu/regor/dependencies|ethosu/regor/architecture/ethosu[68]5/ethos_u[68]5_interface.hpp|ethosu/regor/(tflite|tosa)/(tflite|tosa)_schema_generated.hpp|[A-Z_0-9]*.md + repos: +- repo: https://github.com/macisamuele/language-formatters-pre-commit-hooks + rev: v2.2.0 + hooks: + - id: pretty-format-yaml + name: Pretty format YAML + description: This hook sets a standard for formatting YAML files. + entry: pretty-format-yaml + language: python + types: [yaml] + minimum_pre_commit_version: '1' + args: [--autofix, --indent=4] + +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.0.1 + hooks: + - id: check-toml + - id: end-of-file-fixer + - id: trailing-whitespace + - id: mixed-line-ending + args: [--fix=lf] + description: Forces to replace line ending by the UNIX 'lf' character. 
+ - id: detect-private-key + - id: check-executables-have-shebangs + - id: check-added-large-files + args: [--maxkb=256, --enforce-all] + +- repo: https://github.com/Lucas-C/pre-commit-hooks + rev: v1.5.4 + hooks: + - id: remove-tabs + args: [--whitespaces-count, '4'] + exclude: (M|m)akefile + - repo: https://github.com/pre-commit/mirrors-mypy - rev: 'v0.931' + rev: v0.931 hooks: - id: mypy - args: ["--no-strict-optional", "--show-error-codes", "--ignore-missing-imports"] + args: [--no-strict-optional, --show-error-codes, --ignore-missing-imports] require_serial: true additional_dependencies: [types-setuptools] - minimum_pre_commit_version: '2.9.2' + minimum_pre_commit_version: 2.9.2 - repo: https://github.com/asottile/reorder_python_imports rev: v2.2.0 @@ -42,12 +76,25 @@ repos: hooks: - id: flake8 args: [--max-line-length=120, --extend-ignore=E203] + exclude: ^test/network.py|ethosu/regor/[A-Z_0-9]* - repo: https://github.com/pylint-dev/pylint rev: v2.13.9 hooks: - id: pylint - args: [--score=no, --max-line-length=120, --disable=all, --enable=W0102] + args: [--score=no, --max-line-length=120, --disable=all, --enable=W0102 --ignore=ethosu/regor] + +- repo: https://github.com/pre-commit/mirrors-clang-format + rev: v16.0.6 + hooks: + - id: clang-format + files: .*\.(hpp|cpp|h|c) + types_or: [c++, c] + args: [-style=file, -i] + require_serial: false + additional_dependencies: [] + minimum_pre_commit_version: 2.9.2 + exclude: ethosu/mlw_codec - repo: local hooks: @@ -58,7 +105,7 @@ repos: entry: pytest -s -v types: [python] pass_filenames: false - always_run: true + always_run: false - id: pytest-cov name: pytest-cov @@ -67,4 +114,4 @@ repos: entry: pytest -v --cov=ethosu --cov-fail-under=0 types: [python] pass_filenames: false - always_run: true + always_run: false diff --git a/README.md b/README.md index 95e7ddff..f7293c15 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # Vela +**_NOTE:_** **Ethos-U85 support should be considered beta-quality** + This tool is 
used to compile a [TensorFlow Lite for Microcontrollers](https://www.tensorflow.org/lite/microcontrollers) neural network model into an optimised version that can run on an embedded @@ -330,8 +332,10 @@ Additional useful information: * [Arm Products: Ethos-U55 NPU](https://www.arm.com/products/silicon-ip-cpu/ethos/ethos-u55) * [Arm Products: Ethos-U65 NPU](https://www.arm.com/products/silicon-ip-cpu/ethos/ethos-u65) -* [Arm Developer: Ethos-U55 NPU](https://developer.arm.com/ip-products/processors/machine-learning/arm-ethos-u/ethos-u55) -* [Arm Developer: Ethos-U65 NPU](https://developer.arm.com/ip-products/processors/machine-learning/arm-ethos-u/ethos-u65) +* [Arm Products: Ethos-U85 NPU](https://www.arm.com/products/silicon-ip-cpu/ethos/ethos-u85) +* [Arm Developer: Ethos-U55 NPU](https://developer.arm.com/Processors/Ethos-U55) +* [Arm Developer: Ethos-U65 NPU](https://developer.arm.com/Processors/Ethos-U65) +* [Arm Developer: Ethos-U85 NPU](https://developer.arm.com/Processors/Ethos-U85) ## Security diff --git a/TESTING.md b/TESTING.md index 91fe9cc1..0e296e39 100644 --- a/TESTING.md +++ b/TESTING.md @@ -1,5 +1,5 @@ +# Third Party Software + +The following lists the Third Party software versions and licenses used by Vela: +* FlatBuffers [v23.5.26](https://github.com/google/flatbuffers/releases/tag/v23.5.26) - +([Apache-2.0 License](https://github.com/google/flatbuffers/blob/v23.5.26/LICENSE)) + +* fmt [10.2.1](https://github.com/fmtlib/fmt/releases/tag/10.2.1) - +([MIT License](https://github.com/fmtlib/fmt/blob/10.2.1/LICENSE)) + +* Catch2 [v3.5.3](https://github.com/catchorg/Catch2/releases/tag/v3.5.3) - +([BSL-1.0 License](https://github.com/catchorg/Catch2/blob/v3.5.3/LICENSE.txt)) + +* Gemmlowp [09d81e02ab15b41405caebeb5eb63fd12555aee3](https://github.com/google/gemmlowp/tree/09d81e02ab15b41405caebeb5eb63fd12555aee3) - +([Apache-2.0 License](https://github.com/google/gemmlowp/blob/09d81e02ab15b41405caebeb5eb63fd12555aee3/LICENSE)) + +* pybind11 
[v2.11.1](https://github.com/pybind/pybind11/releases/tag/v2.11.1) - +([BSD-3-Clause License](https://github.com/pybind/pybind11/blob/v2.11.1/LICENSE)) \ No newline at end of file diff --git a/ethosu/config_files/Arm/vela.ini b/ethosu/config_files/Arm/vela.ini index 50ad055f..12170c90 100644 --- a/ethosu/config_files/Arm/vela.ini +++ b/ethosu/config_files/Arm/vela.ini @@ -1,4 +1,4 @@ -; SPDX-FileCopyrightText: Copyright 2020-2021 Arm Limited and/or its affiliates +; SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates ; ; SPDX-License-Identifier: Apache-2.0 ; @@ -104,6 +104,86 @@ Dram_burst_length=128 Dram_read_latency=500 Dram_write_latency=250 +; SRAMx2 (6.4 GB/s) and Flash (0.2 GB/s) +[System_Config.Ethos_U85_SYS_Flash_Low] +core_clock=200e6 +axi0_port=Sram +axi1_port=OffChipFlash +Sram_clock_scale=1 +Sram_ports_used=2 +Sram_burst_length=64 +Sram_read_latency=8 +Sram_write_latency=8 +OffChipFlash_clock_scale=0.0625 +OffChipFlash_ports_used=1 +OffChipFlash_burst_length=128 +OffChipFlash_read_latency=32 +OffChipFlash_write_latency=32 + +; SRAMx2 (16 GB/s) and Flash (0.5 GB/s) +[System_Config.Ethos_U85_SYS_Flash_High] +core_clock=500e6 +axi0_port=Sram +axi1_port=OffChipFlash +Sram_clock_scale=1.0 +Sram_ports_used=2 +Sram_burst_length=64 +Sram_read_latency=32 +Sram_write_latency=32 +OffChipFlash_clock_scale=0.0625 +OffChipFlash_ports_used=1 +OffChipFlash_burst_length=128 +OffChipFlash_read_latency=64 +OffChipFlash_write_latency=64 + +; SRAMx2 (16 GB/s) and DRAMx1 (3.75 GB/s) +[System_Config.Ethos_U85_SYS_DRAM_Low] +core_clock=500e6 +axi0_port=Sram +axi1_port=Dram +Sram_clock_scale=1.0 +Sram_ports_used=2 +Sram_burst_length=64 +Sram_read_latency=16 +Sram_write_latency=16 +Dram_clock_scale=0.46875 +Dram_ports_used=1 +Dram_burst_length=128 +Dram_read_latency=500 +Dram_write_latency=250 + +; SRAMx2 (32 GB/s) and DRAM (12 GB/s) +[System_Config.Ethos_U85_SYS_DRAM_Mid] +core_clock=1e9 +axi0_port=Sram +axi1_port=Dram +Sram_clock_scale=1.0 
+Sram_ports_used=2 +Sram_burst_length=64 +Sram_read_latency=32 +Sram_write_latency=32 +Dram_clock_scale=0.75 +Dram_ports_used=1 +Dram_burst_length=128 +Dram_read_latency=500 +Dram_write_latency=250 + +; SRAMx4 (64 GB/s) and DRAMx2 (24 GB/s) +[System_Config.Ethos_U85_SYS_DRAM_High] +core_clock=1e9 +axi0_port=Sram +axi1_port=Dram +Sram_clock_scale=1.0 +Sram_ports_used=4 +Sram_burst_length=64 +Sram_read_latency=32 +Sram_write_latency=32 +Dram_clock_scale=0.75 +Dram_ports_used=2 +Dram_burst_length=128 +Dram_read_latency=500 +Dram_write_latency=250 + ; ----------------------------------------------------------------------------- ; Memory Mode @@ -120,12 +200,23 @@ const_mem_area=Axi1 arena_mem_area=Axi0 cache_mem_area=Axi0 -; Dedicated SRAM: the SRAM (384KB) is only for use by the Ethos-U +; Dedicated SRAM: the SRAM is only for use by the Ethos-U ; The non-SRAM memory is assumed to be read-writeable [Memory_Mode.Dedicated_Sram] const_mem_area=Axi1 arena_mem_area=Axi1 cache_mem_area=Axi0 + +; Dedicated SRAM 256KB: the SRAM (256KB) is only for use by the Ethos-U +; The non-SRAM memory is assumed to be read-writeable +[Memory_Mode.Dedicated_Sram_256KB] +inherit=Memory_Mode.Dedicated_Sram +arena_cache_size=262144 + +; Dedicated SRAM 384KB: the SRAM (384KB) is only for use by the Ethos-U +; The non-SRAM memory is assumed to be read-writeable +[Memory_Mode.Dedicated_Sram_384KB] +inherit=Memory_Mode.Dedicated_Sram arena_cache_size=393216 ; Dedicated SRAM 512KB: the SRAM (512KB) is only for use by the Ethos-U diff --git a/ethosu/mlw_codec/mlw_decode.c b/ethosu/mlw_codec/mlw_decode.c index 264f3a6a..214b0317 100644 --- a/ethosu/mlw_codec/mlw_decode.c +++ b/ethosu/mlw_codec/mlw_decode.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright 2020, 2022 Arm Limited and/or its affiliates + * SPDX-FileCopyrightText: Copyright 2020, 2022-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -91,7 +91,6 @@ int mlw_decode( uint8_t *inbuf, int inbuf_size, 
int16_t **outbuf, int verbose) { int first=1; int use_zero_run, i, j; int outbuf_size=0; - int nchunks=0; *outbuf=0; @@ -260,7 +259,6 @@ int mlw_decode( uint8_t *inbuf, int inbuf_size, int16_t **outbuf, int verbose) { z_prev_enable = z_enable; z_prev_nsymbols = z_nsymbols; memcpy( z_prev_q, z_q, sizeof(z_prev_q)); - nchunks++; } while( w_prev_enable || z_prev_enable ); // Interleave non-zero and zeros into the outbut buffer diff --git a/ethosu/regor/CMakeLists.txt b/ethosu/regor/CMakeLists.txt new file mode 100644 index 00000000..f7615ff2 --- /dev/null +++ b/ethosu/regor/CMakeLists.txt @@ -0,0 +1,355 @@ +# +# SPDX-FileCopyrightText: Copyright 2021, 2023-2024 Arm Limited and/or its affiliates +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +cmake_minimum_required(VERSION 3.20.2) + +############################################################################# +# Policies +############################################################################# + +macro(regor_policy num val) + string(LENGTH "${num}" policy_var) + math(EXPR policy_var "4 - ${policy_var}") + string(REPEAT "0" ${policy_var} policy_var) + set(policy_var "CMP${policy_var}${num}") + if (POLICY ${policy_var}) + cmake_policy(SET ${policy_var} ${val}) + set(CMAKE_POLICY_DEFAULT_${policy_var} ${val}) + endif() + unset(policy_var) +endmacro() + +regor_policy(63 NEW) +regor_policy(69 NEW) +regor_policy(91 NEW) +regor_policy(92 NEW) +regor_policy(94 NEW) + +############################################################################# +# Project +############################################################################# + +if ("${CMAKE_CURRENT_SOURCE_DIR}" STREQUAL "${CMAKE_SOURCE_DIR}") + project(regor + VERSION 0.1.0 + DESCRIPTION "Regor Ethos-U compiler" + LANGUAGES CXX) + + set_property(GLOBAL PROPERTY USE_FOLDERS ON) + + if ("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_BINARY_DIR}") + message(FATAL_ERROR "${CMAKE_PROJECT_NAME} requires an out of source build.") + endif() +endif() + +############################################################################# +# Folders +############################################################################# + +include(GNUInstallDirs) +# Set libdir in a consistent way +math(EXPR arch_bits "${CMAKE_SIZEOF_VOID_P}*8") +if (arch_bits EQUAL 64) + set(CMAKE_INSTALL_LIBDIR "lib64" CACHE PATH "Libdir path" FORCE) +endif() +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +if (CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) + set(CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}/install" CACHE PATH "Installation path" FORCE) +endif() + 
+############################################################################# +# Config +############################################################################# + +set(DEFAULT_CMAKE_BUILD_TYPE "Debug") +if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) + message(STATUS "No build type selected, default to ${DEFAULT_CMAKE_BUILD_TYPE}") + set(CMAKE_BUILD_TYPE "${DEFAULT_CMAKE_BUILD_TYPE}" CACHE STRING "Build type (default ${DEFAULT_CMAKE_BUILD_TYPE})" FORCE) +endif() + +if (CMAKE_TOOLCHAIN_FILE) + message(STATUS "Toolchain file: ${CMAKE_TOOLCHAIN_FILE}") +endif() + +############################################################################# +# Options +############################################################################# + +# CCACHE support +find_program(CCACHE_PROGRAM ccache) +if (CCACHE_PROGRAM) + message(STATUS "Looking for CCACHE support - Success") + set(ENABLE_CCACHE_DEFAULT ON) +else() + message(STATUS "Looking for CCACHE support - Not found") + set(ENABLE_CCACHE_DEFAULT OFF) +endif() + +# LTO/IPO support. 
Only enable in Release mode +include(CheckIPOSupported) +check_ipo_supported(RESULT ENABLE_LTO_DEFAULT) +if (ENABLE_LTO_DEFAULT) + if ("${CMAKE_BUILD_TYPE}" STREQUAL "Debug") + set(ENABLE_LTO_DEFAULT OFF) + message(STATUS "Looking for IPO support - Success (default disabled)") + else() + message(STATUS "Looking for IPO support - Success") + endif() +else() + message(STATUS "Looking for IPO support - Not found") +endif() + +# Gold linker +find_program(LD_GOLD "ld.gold") +if (LD_GOLD) + message(STATUS "Looking for ld.gold support - Success (default disabled)") + set(ENABLE_LDGOLD_DEFAULT OFF) +else() + message(STATUS "Looking for ld.gold support - Not found") + set(ENABLE_LDGOLD_DEFAULT OFF) +endif() + +# Werror +if ("${CMAKE_BUILD_TYPE}" STREQUAL "Debug") + set(ENABLE_WERROR_DEFAULT OFF) +else() + set(ENABLE_WERROR_DEFAULT ON) +endif() + +# Asserts +if ("${CMAKE_BUILD_TYPE}" STREQUAL "Debug") + set(ENABLE_ASSERT_DEFAULT ON) +else() + set(ENABLE_ASSERT_DEFAULT OFF) +endif() + +option(REGOR_ENABLE_LTO "Link Time Optimization" ${ENABLE_LTO_DEFAULT}) +option(REGOR_ENABLE_LDGOLD "Enable Gold linker if available" ${ENABLE_LDGOLD_DEFAULT}) +option(REGOR_ENABLE_CCACHE "Enable ccache if available" ${ENABLE_CCACHE_DEFAULT}) +option(REGOR_ENABLE_WERROR "Warnings as errors" ${ENABLE_WERROR_DEFAULT}) +option(REGOR_ENABLE_STD_STATIC "Link libstdc and libgcc statically" OFF) +option(REGOR_ENABLE_COVERAGE "Enable coverage build" OFF) +option(REGOR_ENABLE_PROFILING "Enable timer based runtime profiling" OFF) +option(REGOR_ENABLE_ASSERT "Enable asserts" ${ENABLE_ASSERT_DEFAULT}) +option(REGOR_ENABLE_EXPENSIVE_CHECKS "Enable expensive STL GLICXX asserts" OFF) +option(REGOR_ENABLE_RTTI "Enable RTTI" OFF) +option(REGOR_ENABLE_VALGRIND "Enable valgrind during check target" OFF) +option(REGOR_ENABLE_TESTING "Enable unit testing" ON) +option(REGOR_ENABLE_CPPCHECK "Enable cppcheck" OFF) +set(REGOR_SANITIZE "" CACHE STRING "Sanitizer setting. 
For example undefined") +set(REGOR_LOG_TRACE_MASK "" CACHE STRING "Log trace enable mask") +set(REGOR_PACKAGE_NAME "${PROJECT_NAME}" CACHE STRING "CPack package name. Will be suffixed with platform tag") +set(REGOR_DEBUG_COMPRESSION "zlib-gnu" CACHE STRING "Debug symbol compression. none, zlib or zlib-gnu") +set(REGOR_PYTHON_BINDINGS_DESTINATION "" CACHE STRING "Python bindings install destination") +set(REGOR_PYEXT_VERSION "${${PROJECT_NAME}_VERSION}" CACHE STRING "Python extension version") + +############################################################################# +# General purpose +############################################################################# + +# Modules +list(APPEND CMAKE_MODULE_PATH + ${CMAKE_CURRENT_LIST_DIR}/cmake/ + ${CMAKE_CURRENT_LIST_DIR}/dependencies/thirdparty/Catch2 +) + +include(utils) + +# Python +utils_find_python() + +# Export compile commands and flags +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +# Action CCache +if (REGOR_ENABLE_CCACHE) + set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${CCACHE_PROGRAM}") +endif() + +# Action CPPCHECK +if (REGOR_ENABLE_CPPCHECK) + find_program(CPPCHECK cppcheck) + if (CPPCHECK) + message(STATUS "Looking for cppcheck support - Success") + set(CMAKE_CXX_CPPCHECK ${CMAKE_COMMAND} -E env CMAKE_BINARY_DIR=${CMAKE_BINARY_DIR} + ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/tools/cppcheck.py + ${CPPCHECK} --enable=all --inconclusive --quiet + --suppress=missingIncludeSystem + --suppress=*:${CMAKE_CURRENT_LIST_DIR}/dependencies/* + ) + else() + message(STATUS "Looking for cppcheck support - Not found") + endif() +endif() + +############################################################################# +# Modules +############################################################################# + +include(regor_options) +include(regor_lib) +include(regor_dependencies) + +############################################################################# +# Top level targets 
+############################################################################ + +find_package(Threads REQUIRED) + +if (REGOR_PYTHON_BINDINGS_DESTINATION AND NOT TARGET pybind11::module) + regor_add_dependency(dependencies/thirdparty/pybind11) +endif() + +set(REGOR_LIB_DEPS + Threads::Threads + regor::flatbuffers + regor::fmt + regor::gemmlowp + mlw_codec_st +) + +set(REGOR_HEADERS + "include/regor.h" + "include/regor_database.hpp" + "include/regor_interface.hpp" +) + +regor_lib( + NAME + regor-objects + TYPE OBJECT + DEFINES + "$<$:LOG_TRACE_ENABLE=${REGOR_LOG_TRACE_MASK}>" + DEPS + ${REGOR_LIB_DEPS} + PUBLIC_HEADERS + ${REGOR_HEADERS} + INC_DIRS + ${CMAKE_CURRENT_SOURCE_DIR} + SOURCES + "regor.cpp" + "common/common.cpp" + "common/data_type.cpp" + "common/logging.cpp" + "common/scaling.cpp" + "common/transpose_type.cpp" + "common/reverse_type.cpp" + "architecture/architecture.cpp" + "architecture/ethos_u_scaling.cpp" + "architecture/mlw_encode.cpp" + "architecture/ethosu55/ethos_u55.cpp" + "architecture/ethosu55/ethos_u55_scaling.cpp" + "architecture/ethosu55/ethos_u55_weight_encoder.cpp" + "architecture/ethosu55/ethos_u55_performance.cpp" + "architecture/ethosu55/ethos_u55_register_cs_generator.cpp" + "architecture/ethosu65/ethos_u65.cpp" + "architecture/ethosu65/ethos_u65_register_cs_generator.cpp" + "architecture/ethosu85/ethos_u85.cpp" + "architecture/ethosu85/ethos_u85_register_cs_generator.cpp" + "architecture/ethosu85/ethos_u85_scaling.cpp" + "architecture/ethosu85/ethos_u85_weight_encoder.cpp" + "architecture/ethosu85/ethos_u85_performance.cpp" + "compiler/attributes.cpp" + "compiler/compiler.cpp" + "compiler/faststorage_allocator.cpp" + "compiler/graph_builder.cpp" + "compiler/graph_packing.cpp" + "compiler/high_level_command_stream_generator.cpp" + "compiler/hillclimb_allocator.cpp" + "compiler/live_range.cpp" + "compiler/network_performance.cpp" + "compiler/operation.cpp" + "compiler/quantization.cpp" + "compiler/raw_writer.cpp" + 
"compiler/scheduler.cpp" + "compiler/scheduler_decompose.cpp" + "compiler/scheduler_packing.cpp" + "compiler/softmax.cpp" + "compiler/tensor.cpp" + "compiler/tensor_allocator.cpp" + "compiler/tflite_graph_optimiser.cpp" + "compiler/tflite_graph_optimiser_tp.cpp" + "compiler/cascade_builder.cpp" + "compiler/graph_optimiser.cpp" + "compiler/graphir_optimiser.cpp" + "compiler/optimiser_utils.cpp" + "compiler/graph_validator.cpp" + "compiler/tosa_graph_validator.cpp" + "compiler/op_type.cpp" + "tosa/tosa_validator.cpp" + "tosa/tosa_argument_checks.cpp" + "tosa/tosa_error_checks.cpp" + "tosa/tosa_level_checks.cpp" + "tosa/tosa_require_checks.cpp" + "tosa/tosa_validator_version_0_60_0_profile_bi.cpp" + "tosa/tosa_reader.cpp" + "tosa/tosa_mapping.cpp" + "tflite/tflite_reader.cpp" + "tflite/tflite_writer.cpp" + "tflite/tflite_mapping.cpp" + "tflite/tflite_model_semantics.cpp" +) + +regor_lib( + NAME + regor-static + OUTPUT_NAME + regor + COMPONENT regor + TYPE STATIC + DEPS + regor-objects + PUBLIC_HEADERS + ${REGOR_HEADERS} +) + +if (REGOR_PYTHON_BINDINGS_DESTINATION) + regor_lib( + NAME + PyRegor + OUTPUT_NAME + regor + COMPONENT python-bindings + INSTALL_LOCATION + ${REGOR_PYTHON_BINDINGS_DESTINATION} + TYPE PY_MODULE + DEFINES + REGOR_VERSION="${REGOR_PYEXT_VERSION}" + SOURCES + "bindings/python/py_regor.cpp" + DEPS + regor-objects + COPTS + "$,/GR,-frtti>" + ) +endif() + +############################################################################# +# Subdirs +############################################################################ + +if (REGOR_ENABLE_TESTING) + # Enable testing at the top level to get a top level check target + include(CTest) + add_subdirectory(test) +endif() + +# CPack last +include(cpack_config) diff --git a/ethosu/regor/architecture/architecture.cpp b/ethosu/regor/architecture/architecture.cpp new file mode 100644 index 00000000..1f90fc64 --- /dev/null +++ b/ethosu/regor/architecture/architecture.cpp @@ -0,0 +1,221 @@ +// +// 
SPDX-FileCopyrightText: Copyright 2021, 2023-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "architecture.hpp" + +#include "common/logging.hpp" + +#include "common/bit_flags.hpp" + +BEGIN_ENUM_TABLE(regor::MemUsage) + ADD_ENUM_NAME(None) + ADD_ENUM_NAME(ReadOnly) + ADD_ENUM_NAME(FeatureMap) + ADD_ENUM_NAME(LUT) + ADD_ENUM_NAME(Staging) +END_ENUM_TABLE() + +namespace regor +{ + +IniParseResult Architecture::ParseSection(const std::string §ion, IniReader *reader) +{ + // Parse the architecture config (must happen first in INI file ). 
+ if ( section == "architecture" ) + { + if ( !ParseConfig(reader) ) + { + return IniParseResult::Error; + } + } + // Parse memory definitions + else if ( section == "memory" || section.find("memory.") == 0 ) + { + if ( !ParseMemory(reader, section) ) + { + return IniParseResult::Error; + } + } + // Parse system configuration locally + else if ( section == "system" ) + { + std::string key; + std::string tmp; + while ( reader->Begin(key) ) + { + if ( key == "const" ) + { + tmp = reader->Get(); + SetReadonlyMemory(tmp); + } + else if ( key == "feature_maps" ) + { + tmp = reader->Get(); + SetFeatureMapMemory(tmp); + } + else if ( key == "staging" ) + { + tmp = reader->Get(); + SetStagingMemory(tmp); + } + else + { + LOG_WARN("Skipping parsing of unrecognised configuration option '{}' of section '{}'\n", key, section); + } + reader->End(); + } + } + else + { + LOG_WARN("Skipping parsing of unrecognised configuration section '{}'\n", section); + return IniParseResult::Unknown; + } + + return IniParseResult::Done; +} + + +bool Architecture::ParseMemory(IniReader *reader, const std::string §ion) +{ + std::string name; + Address size = MaxAddress(); + float bandwidth = 1; + int readLatency = 0; + int writeLatency = 0; + int burstLength = 1; + int portIndex = 0; + + // Parse memory definition + std::string key; + while ( reader->Begin(key) ) + { + if ( key == "name" ) + { + name = reader->Get(); + } + else if ( key == "size" ) + { + size = reader->Get(); + std::string suffix; + if ( reader->Read(suffix) ) + { + if ( suffix == "kb" ) + { + size *= 1024; + } + else if ( suffix == "mb" ) + { + size *= 1024 * 1024; + } + } + } + else if ( key == "bandwidth" ) + { + bandwidth = std::max(0.0001f, reader->Get()); + } + else if ( key == "read_latency" ) + { + readLatency = std::max(0, reader->Get()); + } + else if ( key == "write_latency" ) + { + writeLatency = std::max(0, reader->Get()); + } + else if ( key == "burst_length" ) + { + burstLength = std::max(1, reader->Get()); + } + 
else if ( key == "port_index" ) + { + portIndex = std::max(0, reader->Get()); + } + else + { + LOG_WARN("Skipping parsing of unrecognised memory configuration option '{}'\n", key); + } + + reader->End(); + } + + // Add a named, sized, memory to the system memory map + if ( name.empty() ) + { + LOG_ERROR("Unable to parse memory configuration. All memories must have a name.\n"); + return false; + } + if ( (std::string("memory.") + name) != section ) + { + LOG_ERROR("Unable to parse memory configuration. All memories must have matching name key and section name.\n"); + return false; + } + else if ( _memories.count(name) ) + { + LOG_ERROR("Unable to parse memory configuration for '{}'. All memories must have a unique name.\n", name); + return false; + } + else if ( size <= 0 ) + { + LOG_ERROR("Unable to parse memory configuration for '{}' of size {} bytes. All memories must have size > 0.\n", name, size); + return false; + } + else + { + auto memory = std::make_unique(name, size); + memory->SetParameters(bandwidth, readLatency, writeLatency, burstLength, portIndex); + _memories[name] = std::move(memory); + } + + return true; +} + + +bool Architecture::CheckConfiguration(std::string &error) +{ + if ( !_featuremapMemory ) + { + error = "Feature Map memory not configured"; + return false; + } + if ( !_lutMemory ) + { + error = "LUT memory not configured"; + return false; + } + if ( !_stagingMemory ) + { + error = "Staging memory not configured"; + return false; + } + if ( !_readonlyMemory ) + { + error = "Readonly memory not configured"; + return false; + } + for ( auto &mem : _memories ) + { + if ( mem.second->SizeBytes() > MaxAddress() ) + { + error = "Configured memory size out of bounds for memory: " + mem.first; + return false; + } + } + + return true; +} + +} // namespace regor diff --git a/ethosu/regor/architecture/architecture.hpp b/ethosu/regor/architecture/architecture.hpp new file mode 100644 index 00000000..56bac80c --- /dev/null +++ 
b/ethosu/regor/architecture/architecture.hpp @@ -0,0 +1,391 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "common/common.hpp" + +#include "common/bit_flags.hpp" +#include "common/data_type.hpp" +#include "common/ini_reader.hpp" +#include "common/numeric_util.hpp" +#include "common/reverse_type.hpp" +#include "common/scaling.hpp" +#include "common/shape.hpp" +#include "common/transpose_type.hpp" +#include "compiler/kernel.hpp" +#include "compiler/op_type.hpp" +#include "compiler/tensor_properties.hpp" +#include "mlw_encode.hpp" + +#include +#include +#include +namespace regor +{ + +class WeightEncoder; +class IRegisterCommandStreamGenerator; + +using Address = int64_t; + +enum class TensorFormat : int16_t +{ + Unknown = 0, + NHWC = 1, + NHCWB16 = 2, + WeightsEncoded = 3, +}; + +/// +/// Architecture connected-memory definition +/// +struct ArchitectureMemory +{ +protected: + std::string _name; + Address _sizeBytes = 0; + float _bandwidthPerCycle = 1; + int _readLatencyCycles = 0; + int _writeLatencyCycles = 0; + int _maxBurstLengthBytes = 0; + int _portIndex = 0; + +public: + ArchitectureMemory(const std::string &name, Address sizeBytes) : _name(name), _sizeBytes(sizeBytes) {} + +public: + void SetParameters(float bandwidth, int readLatency, int writeLatency, int maxBurstLengthBytes, int portIndex) + { + 
_bandwidthPerCycle = bandwidth; + _readLatencyCycles = readLatency; + _writeLatencyCycles = writeLatency; + _maxBurstLengthBytes = maxBurstLengthBytes; + _portIndex = portIndex; + } + + float Bandwidth() const { return _bandwidthPerCycle; } // Assume read/write symmetrical + int ReadLatency() const { return _readLatencyCycles; } + int WriteLatency() const { return _writeLatencyCycles; } + Address SizeBytes() const { return _sizeBytes; } + int MaxBurstLength() const { return _maxBurstLengthBytes; } + std::string Name() const { return _name; } +}; + +enum class MemUsage : uint16_t +{ + None = 0, + ReadOnly = 0x1, + FeatureMap = 0x2, + LUT = 0x4, + Staging = 0x8, +}; + +struct MemArea +{ + ArchitectureMemory *memory = nullptr; // The physical memory area + Flags usage; // Usage partition within the memory area + + MemArea() = default; + MemArea(ArchitectureMemory *architectureMemory, MemUsage memUsage) : memory(architectureMemory), usage(memUsage) {} + + bool operator==(const MemArea &other) const { return memory == other.memory && usage == other.usage; } + bool operator!=(const MemArea &other) const { return !operator==(other); } + explicit operator bool() const { return (memory != nullptr) && (usage != MemUsage::None); } + + struct hash + { + size_t operator()(const MemArea &memArea) const { return size_t(memArea.memory) | size_t(memArea.usage); } + }; +}; + +/// +/// Per-operator architecture configuration base +/// +class ArchitectureOpConfig +{ +public: + virtual ~ArchitectureOpConfig() = default; + virtual std::unique_ptr Clone() = 0; + virtual int MaxIFMBuffering() = 0; + virtual Point2i OptimalStripeGranule() = 0; + virtual int OptimalDepthGranule() = 0; + virtual std::string ToString(bool full) = 0; +}; + +enum class ArchResampling : uint8_t +{ + None = 0, + Nearest = 1, + Zeros = 2, +}; + +enum class ArchResizeMode : uint8_t +{ + Nearest = 0, + Bilinear = 1, + Replicate = 2, +}; + +/// +/// Description of a candidate op to add to a ArchitectureOpGroup +/// 
/// <summary>
/// Description of a candidate op to add to a ArchitectureOpGroup
/// </summary>
struct ArchitectureOpGroupQuery
{
    // Identity and element type of one tensor connection of the candidate op.
    struct TensorInfo
    {
        UniqueId key;
        DataType type;
    };

    OpType type;
    const Kernel *kernel;
    TensorInfo ifm;   // Primary input
    TensorInfo ifm2;  // Secondary input (binary elementwise); unused otherwise
    TensorInfo ofm;   // Output
};

/// <summary>
/// Group of ops that can be fused and/or chained
/// </summary>
class ArchitectureOpGroup
{
public:
    virtual ~ArchitectureOpGroup() = default;
    // Try to add `op` to the group; `dependsOn` lists keys of ops already in
    // the group that this op consumes (element type reconstructed - TODO confirm).
    virtual int Add(const ArchitectureOpGroupQuery &op, const std::vector<int> &dependsOn = {}) = 0;

protected:
    // Whether this op can execute on the NPU at all (architecture-specific).
    virtual bool CanRunOnNPU(const ArchitectureOpGroupQuery &op) = 0;
};

// Where the accumulator register file is initialised from.
enum class ArchAccumulatorSource : uint8_t
{
    Reset = 0,  // Start from zero (normal case)
    Acc = 1,    // Keep the previous accumulator contents
    Ifm2 = 2    // Preload from the secondary input
};

/// <summary>
/// Query Information to retrieve HW specific operator config
/// </summary>
struct ArchitectureConfigQuery
{
    Shape ofmShape;
    Shape ifmShape[2];
    int ifmBits;               // Element width of the IFM in bits
    Kernel *kernel;
    int lutBytes;              // LUT storage required (0 if no LUT)
    bool scaled;               // Whether OFM scaling is applied
    ArchResampling ifmResampling;
    TransposeType transpose;
    ReverseType reverse;
    TensorFormat ofmFormat;
    WeightFormat weightFormat;
    ArchAccumulatorSource accSource = ArchAccumulatorSource::Reset;
    bool accOutputEnabled = true;
    // Fractional resize scaling in each axis (identity by default).
    struct Rescale
    {
        GraphApi::FractionND scaleY = {1, 1};
        GraphApi::FractionND scaleX = {1, 1};
    } rescaling;
};

/// <summary>
/// Information for querying operation performance
/// </summary>
struct PerformanceQuery
{
    OpType type;
    Kernel *kernel;
    ArchitectureOpConfig *config;       // Block config chosen for this op
    Shape ifmShape[2];
    ArchitectureMemory *ifmMemory[2];
    DataType ifmType[2];
    TensorFormat ifmFormat[2];
    Shape ofmShape;
    ArchitectureMemory *ofmMemory;
    DataType ofmType;
    TensorFormat ofmFormat;
    Shape constShape;                   // Weights/bias shape
    ArchitectureMemory *constMemory;
};

/// <summary>
/// Information for querying performance for HW fused operations
/// </summary>
struct FusionQuery
{
    OpType type;
    Kernel *kernel = nullptr;
    Shape ifm2Shape;
    ArchitectureMemory *ifm2Memory = nullptr;
    DataType ifm2Type;
    TensorFormat ifm2Format;
};

/// <summary>
/// Information for querying support for Resize
/// </summary>
struct ResizeSupportQuery
{
    ArchResizeMode mode;
    GraphApi::FractionND scaleY;
    GraphApi::FractionND scaleX;
    int offsetY;
    int offsetX;
    Shape ifmShape;
};

/// <summary>
/// Information for querying whether an operation can be executed by the architecture
/// </summary>
struct ExecutionQuery
{
    DataType ifmType[2];
    DataType ofmType;
    const Kernel *kernel;
};

/// <summary>
/// Cycle cost of performing an operation
/// </summary>
struct CycleCost
{
    int64_t opCycles = 0;  // Estimated total NPU cycles
    int64_t macs = 0;      // Multiply-accumulate operations performed
};

/// <summary>
/// How elements are accessed during an operation
/// </summary>
struct ElementAccess
{
    int ifmRead[2] = {0, 0};  // Elements read per IFM
    int ofmWrite = 0;         // Elements written to the OFM
    int weightsRefetch = 0;   // Number of times the weights are re-fetched
    int constRead[2] = {0, 0};
};

/// <summary>
/// Architecture performance interface
/// </summary>
class ArchitecturePerformance
{
public:
    virtual ~ArchitecturePerformance() = default;
    // Estimate cycles for `query` with any `fused` follow-on ops included.
    virtual CycleCost MeasureCycleCost(const PerformanceQuery &query, const std::vector<FusionQuery> &fused) = 0;
    // Estimate cycles for a DMA transfer of `sizeBytes` between two memories.
    virtual int64_t MemToMemCycles(const ArchitectureMemory *dest, const ArchitectureMemory *source, int sizeBytes) = 0;
    virtual ElementAccess MeasureElementAccess(const PerformanceQuery &query) = 0;
    // Convert an element-count access record into a byte-count record.
    virtual ElementAccess ElementTransferToBytes(const PerformanceQuery &query, const ElementAccess &access) = 0;
};

// Outcome of parsing one ini-file section.
enum class IniParseResult
{
    Unknown = 0,  // Section not recognised (skipped)
    Done,
    Error,
};

// Axes along which an operation may be subdivided for striping.
enum class AxisMask
{
    None = 0,
    AxisY = 1,
};

/// <summary>
/// ArchitectureFeatures base
/// </summary>
class Architecture
{
protected:
    // All memories parsed from [memory.<name>] sections, keyed by name.
    std::unordered_map<std::string, std::unique_ptr<ArchitectureMemory>> _memories;
    // Role pointers below reference entries owned by _memories (or by the
    // derived architecture, e.g. on-chip SHRAM for the LUT).
    ArchitectureMemory *_readonlyMemory = nullptr;
    ArchitectureMemory *_featuremapMemory = nullptr;
    ArchitectureMemory *_lutMemory = nullptr;
    ArchitectureMemory *_stagingMemory = nullptr;

public:
    virtual ~Architecture() = default;
    virtual bool ParseConfig(IniReader *reader) = 0;
    // Validate that all memory roles are bound and sizes are in range.
    virtual bool CheckConfiguration(std::string &error);
    virtual std::unique_ptr<ArchitectureOpConfig> GetOpConfig(OpType opType, const ArchitectureConfigQuery &query) = 0;
    virtual std::unique_ptr<ArchitectureOpGroup> CreateOpGroup(const ArchitectureOpGroupQuery &op) = 0;
    virtual class WeightEncoder *WeightEncoder() = 0;
    virtual ArchitecturePerformance *Performance() = 0;
    virtual IRegisterCommandStreamGenerator *RegisterCommandStreamGenerator() = 0;
    virtual TensorFormat IdealBufferingFormat() { return TensorFormat::Unknown; }
    virtual Address MaxAddress() = 0;
    virtual std::vector<uint32_t> ConfigRegisters() = 0;
    virtual uint32_t Version() = 0;
    // Returns the upscale factor for `resampling` and sets `rounding`.
    virtual int UpscaleAndRounding(ArchResampling resampling, int &rounding) = 0;
    virtual AxisMask CanSubdivide(OpType opType) = 0;
    virtual bool SupportsLeakyRelu(bool quantized, DataType type) = 0;
    virtual bool SupportsMatMul(OpType opType) = 0;
    virtual bool SupportsTranspose(OpType opType, TransposeType transposeType) = 0;
    virtual bool SupportsReverse(OpType opType, ReverseType reverseType) = 0;
    virtual bool SupportsGather(OpType opType) = 0;
    virtual bool SupportsScatter(OpType opType) = 0;
    virtual bool SupportsSigmoidTanhLutInt16(OpType opType) = 0;
    virtual bool SupportsResize(const ResizeSupportQuery &query) = 0;
    virtual bool SupportsAccumulatorMode(ArchAccumulatorSource source, bool outputEnabled) = 0;
    virtual bool SupportsScalar(OpType opType, DataType dataType, TensorUsage usage) = 0;
    virtual bool SupportsArgMax(OpType opType) = 0;
    virtual Flags<WeightFormat> SupportedWeightFormat(OpType op) = 0;

    MemArea ReadonlyMemory()
    {
        assert(_readonlyMemory);
        return MemArea(_readonlyMemory, MemUsage::ReadOnly);
    }
    // When feature-map and staging memory are the same physical memory the
    // returned area carries both usage flags, so allocators share the space.
    MemArea FeatureMapMemory()
    {
        assert(_featuremapMemory);
        Flags<MemUsage> usage = MemUsage::FeatureMap;
        if ( _featuremapMemory == _stagingMemory )
        {
            usage |= MemUsage::Staging;
        }
        return MemArea(_featuremapMemory, usage);
    }
    MemArea LUTMemory()
    {
        assert(_lutMemory);
        return MemArea(_lutMemory, MemUsage::LUT);
    }
    MemArea StagingMemory()
    {
        assert(_stagingMemory);
        Flags<MemUsage> usage = MemUsage::Staging;
        if ( _featuremapMemory == _stagingMemory )
        {
            usage |= MemUsage::FeatureMap;
        }
        return MemArea(_stagingMemory, usage);
    }

    // Parse one ini section ("architecture", "memory.*" or "system").
    IniParseResult ParseSection(const std::string &section, IniReader *reader);

    // Select named memories (name must exist in _memories; throws otherwise).
    void SetReadonlyMemory(const std::string &name) { _readonlyMemory = _memories.at(name).get(); }
    void SetFeatureMapMemory(const std::string &name) { _featuremapMemory = _memories.at(name).get(); }
    void SetStagingMemory(const std::string &name) { _stagingMemory = _memories.at(name).get(); }

private:
    // Parse a single [memory.<name>] definition into _memories.
    bool ParseMemory(IniReader *reader, const std::string &section);
};

} // namespace regor
+// + +#include "common/common.hpp" +#include "common/logging.hpp" + +#include "register_command_stream_generator.hpp" + +namespace regor +{ + +template +class EthosURegisterCSGenerator : public IRegisterCommandStreamGenerator +{ +public: + void PrintCommandStream(const std::vector &stream, std::vector> &debugInfo) override + { + LOG_PRINT("Register command stream: {} words\n", stream.size()); + LOG_PRINT("{0:>8}: {1:8}{2:4} {3:4} - {4:30} {5:5}, {6}\n", " Offset", "Payload", "Param", "Code", "Command", "Param", "Fields"); + + size_t debugInfoIdx = 0; + for ( unsigned i = 0; i < stream.size(); ) + { + if ( debugInfoIdx < debugInfo.size() && debugInfo[debugInfoIdx].first == i ) + { + LOG_PRINT("// {}\n", debugInfo[debugInfoIdx++].second); + } + const uint32_t *d = &stream[i]; + std::string op; + std::vector> fields; + int nrWords = static_cast(this)->Disassemble(d, op, fields); + uint32_t code = *d & 0xffff; + uint32_t par = *d >> 16; + uint32_t payload = 0; + if ( nrWords == 2 && i + 1 < stream.size() ) + { + payload = stream[i + 1]; + } + const auto &intr = nrWords == 2 ? fmt::format("{:08x}", payload) : fmt::format("{:8}", ""); + LOG_PRINT("{0:#08x}: {1} {2:04x} {3:04x} - {4:30} {5:5}", i * sizeof(uint32_t), intr, par, code, op, par); + i += nrWords; + for ( auto &f : fields ) + { + LOG_PRINT(", {} = {}", f.first, f.second); + } + LOG_PRINT("\n"); + } + } +}; + +} // namespace regor diff --git a/ethosu/regor/architecture/ethos_u_scaling.cpp b/ethosu/regor/architecture/ethos_u_scaling.cpp new file mode 100644 index 00000000..3f2b17ad --- /dev/null +++ b/ethosu/regor/architecture/ethos_u_scaling.cpp @@ -0,0 +1,69 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2023 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "ethos_u_scaling.hpp" + +#include "common/numeric_util.hpp" + +#include +#include +#include +#include + +namespace regor +{ + +void QuantizePoolingScale(int kernelElements, double rescale, int rescaleBits, uint32_t &scale, int &shift, int N) +{ + int exp; + std::frexp(float(kernelElements - 1), &exp); + // N = scale instruction register size + int n = (N - 1) - rescaleBits; + scale = uint32_t(std::ceil(rescale * double(((1ULL << (n + exp)) + (1ULL << exp)) / kernelElements))); + shift = n + exp; + assert(unsigned(shift) < 64); +} + +void QuantizePoolingScaleMaxPrecision(int kernelElements, double rescale, uint32_t &scale, int &shift, int N) +{ + int rescaleBits = 0; + // if rescale != 1, scale need to consider the number of bits needed for rescaling + if ( rescale > 1 ) + { + rescaleBits = IntLog2(rescale) + 2; + } + else if ( rescale < 1 ) + { + rescaleBits = -IntLog2(1.0 / rescale); + } + QuantizePoolingScale(kernelElements, rescale, rescaleBits, scale, shift, N); +} + +// Simplified version of calculating elementwise Add/Sub scales +void SimplifiedElementwiseAddSubScale(double input1Scale, double input2Scale, double outputScale, int inputShift, + double &input1Rescale, double &input2Rescale, QuantizedScale &outScale) +{ + auto m = 2 * std::max(input1Scale, input2Scale); + auto f = double(int64_t(1) << inputShift); + input1Rescale = input1Scale * f / m; + input2Rescale = input2Scale * f / m; + double outputRescale = m / (outputScale * f); + outScale = QuantizedScale(outputRescale); +} + +} // namespace regor diff --git 
a/ethosu/regor/architecture/ethos_u_scaling.hpp b/ethosu/regor/architecture/ethos_u_scaling.hpp new file mode 100644 index 00000000..1cbb7c55 --- /dev/null +++ b/ethosu/regor/architecture/ethos_u_scaling.hpp @@ -0,0 +1,37 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2023 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "common/scaling.hpp" + +#include + +namespace regor +{ + +void QuantizePoolingScale(int kernelElements, double rescale, int rescaleBits, uint32_t &scale, int &shift, int N); + +// Max scale precision based on register size N (32 or 31) +void QuantizePoolingScaleMaxPrecision(int kernelElements, double rescale, uint32_t &scale, int &shift, int N); + +// Simplified version of calculating elementwise Add/Sub scales +void SimplifiedElementwiseAddSubScale(double input1Scale, double input2Scale, double outputScale, int inputShift, + double &input1Rescale, double &input2Rescale, QuantizedScale &outScale); + +} // namespace regor diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55.cpp b/ethosu/regor/architecture/ethosu55/ethos_u55.cpp new file mode 100644 index 00000000..e973170c --- /dev/null +++ b/ethosu/regor/architecture/ethosu55/ethos_u55.cpp @@ -0,0 +1,932 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 
2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "ethos_u55.hpp" + +#include "common/common.hpp" +#include "common/logging.hpp" + +#include "common/bit_flags.hpp" +#include "common/numeric_util.hpp" +#include "ethos_u55_performance.hpp" +#include "ethos_u55_register_cs_generator.hpp" +#include "ethos_u55_weight_encoder.hpp" + +#include +#include +#include +#include +#include + +BEGIN_ENUM_TABLE(regor::EthosU55SHRamElements) + ADD_ENUM_NAME(SHRAM_IFM8) + ADD_ENUM_NAME(SHRAM_IFM16) + ADD_ENUM_NAME(SHRAM_IFM8_Elementwise) + ADD_ENUM_NAME(SHRAM_IFM16_Elementwise) + ADD_ENUM_NAME(SHRAM_IFM32) + ADD_ENUM_NAME(SHRAM_Acc16) + ADD_ENUM_NAME(SHRAM_Acc32) + ADD_ENUM_NAME(SHRAM_Acc40) +END_ENUM_TABLE() + +BEGIN_ENUM_TABLE(regor::EthosUTraversal) + ADD_ENUM_NAME(DepthFirst) + ADD_ENUM_NAME(PartKernel) + ADD_ENUM_NAME(Depthwise) +END_ENUM_TABLE() + +namespace regor +{ + +static const EthosU55PerfInfo s_EthosU55PerfInfo[] = { + // Accelerator.Ethos_U55_32 + {{2.0, 3.0, 3.0, 3.0, 4.0, 6.0, 1.0, 2.0}, {1.0, 1.0, 0.0}}, + // Accelerator.Ethos_U55_64 + {{1.0, 1.5, 1.5, 1.5, 2.0, 3.0, 0.5, 1.0}, {1.0, 1.0, 0.0}}, + // Accelerator.Ethos_U55_128 + {{0.75, 1.25, 0.75, 0.75, 1.0, 1.5, 0.25, 0.5}, {1.0, 0.5, 0.0}}, + // Accelerator.Ethos_U55_256 + {{0.625, 1.125, 0.5, 0.375, 0.5, 0.75, 0.125, 0.25}, {1.0, 0.25, 0.0}}, +}; + +static const ArchEthosU55::AcceleratorConfig s_EthosU55Configs[] = { + // Accelerator.Ethos_U55_32 + {32, 1, Shape(1, 1, 4), Shape(1, 1, 8), 16, {2, 2, 2, 2, 4, 4, 4, 4}, 1, &s_EthosU55PerfInfo[0]}, + // 
Accelerator.Ethos_U55_64 + {64, 1, Shape(1, 1, 8), Shape(1, 1, 8), 16, {2, 2, 2, 2, 4, 4, 4, 8}, 2, &s_EthosU55PerfInfo[1]}, + // Accelerator.Ethos_U55_128 + {128, 1, Shape(1, 2, 8), Shape(1, 2, 8), 24, {4, 4, 4, 4, 8, 4, 8, 12}, 4, &s_EthosU55PerfInfo[2]}, + // Accelerator.Ethos_U55_256 + {256, 1, Shape(2, 2, 8), Shape(2, 2, 8), 48, {8, 8, 8, 8, 16, 8, 16, 20}, 8, &s_EthosU55PerfInfo[3]}, +}; + +enum class ElementwiseUsage +{ + No = 0, + Full = 1, + Scalar = 2, +}; + +static const int s_SHRAMElementBits[] = { + 8, // IFM8 + 16, // IFM16 + 8, // IFM8_Elementwise + 16, // IFM16_Elementwise + 32, // IFM32 + 16, // Acc16 + 32, // Acc32 + 40, // Acc40 +}; + +static_assert(std::size(s_SHRAMElementBits) == int(SHRAM_Last) + 1, "Bad element mapping"); + + +ArchEthosU55::ArchEthosU55() : _subkernelMax(8, 8, 65536), _ofmBlockMax(32, 64, 128) +{ + _weightEncoder = std::make_unique(this); + _rcsGenerator = std::make_unique(this); +} + +uint32_t ArchEthosU55::Version() +{ + return EthosU55RCSGenerator::IdRegister(); +} + +bool ArchEthosU55::ParseConfig(IniReader *reader) +{ + // Parse architecture configuration + std::string key; + int macs = 0; + while ( reader->Begin(key) ) + { + if ( key == "macs" ) + { + macs = reader->Get(); + } + reader->End(); + } + + // Find the requested MAC configuration for this accelerator + auto cfg = std::find_if(s_EthosU55Configs, std::cend(s_EthosU55Configs), + [&](const AcceleratorConfig &config) { return config.macs == macs; }); + if ( cfg == std::cend(s_EthosU55Configs) ) + { + assert(macs == 32 || macs == 64 || macs == 128 || macs == 256); + LOG_TRACE0("Unable to find U55 accelerator for macs={}", macs); + return false; + } + + ApplyConfig(cfg); + + return true; +} + +void ArchEthosU55::ApplyConfig(const AcceleratorConfig *cfg) +{ + // Basic configuration + _cores = cfg->cores; + _macs = cfg->macs; + _ifmUBlock = cfg->ifmUblock; + _ofmUBlock = cfg->ofmUBlock; + + // All SHRAM granules + _shramGranules = cfg->shramGranules; + + // Bank 
granules organised by bit width + _ifmBankGranules[0] = cfg->shramGranules[SHRAM_IFM8]; + _ifmBankGranules[1] = cfg->shramGranules[SHRAM_IFM16]; + _ifmBankGranules[2] = 0; + _ifmBankGranules[3] = cfg->shramGranules[SHRAM_IFM32]; + + // Elementwise bank granules organised by bit width + _ifmEWBankGranules[0] = cfg->shramGranules[SHRAM_IFM8_Elementwise]; + _ifmEWBankGranules[1] = cfg->shramGranules[SHRAM_IFM16_Elementwise]; + _ifmEWBankGranules[2] = 0; + _ifmEWBankGranules[3] = cfg->shramGranules[SHRAM_IFM32]; + + // SHRAM layout information + _shram.reservedOutputBanks = 2; + _shram.bankSizeBytes = 1024, _shram.totalBanks = cfg->shramBanks; + _shram.reservedEndBanks = (_shram.totalBanks > 16) ? 2 : 0; + + _shramMemory = std::make_unique("shram", _shram.bankSizeBytes * _shram.totalBanks); + _shramMemory->SetParameters(1, 0, 0, 1, 0); // TODO: These figures make no sense for the shram (they depend on + // which HW unit is accessing the memory) + _lutMemory = _shramMemory.get(); + _performance = std::unique_ptr(new EthosU55Performance(this, cfg->perfInfo)); +} + + +std::unique_ptr ArchEthosU55::GetOpConfig(OpType opType, const ArchitectureConfigQuery &query) +{ + auto config = FindBlockConfig(opType, query); + return config; +} + + +std::unique_ptr ArchEthosU55::CreateOpGroup(const ArchitectureOpGroupQuery &op) +{ + LOG_TRACE1("Trying to create ArchEthosU55 OpGroup for {}\n", OpTypeToString(op.type)); + + auto group = std::make_unique(); + if ( !group->Add(op) ) + { + return nullptr; + } + + return group; +} + +std::vector ArchEthosU55::ConfigRegisters() +{ + return std::vector(1, ConfigRegister(0)); +} + +int ArchEthosU55::UpscaleAndRounding(ArchResampling resampling, int &rounding) +{ + rounding = (resampling == ArchResampling::Nearest) ? 1 : 0; + return (resampling == ArchResampling::None) ? 
1 : 2; +} + +AxisMask ArchEthosU55::CanSubdivide(OpType opType) +{ + if ( IsConvolution(opType) || IsElementwise(opType) || IsPooling(opType) ) + { + return AxisMask::AxisY; + } + return AxisMask::None; +} + +bool ArchEthosU55::SupportsLeakyRelu(bool quantized, DataType type) +{ + return quantized == false && type == DataType::Int16; +} + +bool ArchEthosU55::SupportsMatMul(OpType opType) +{ + UNUSED(opType); + return false; +} + +bool ArchEthosU55::SupportsTranspose(OpType opType, TransposeType transposeType) +{ + UNUSED(opType); + UNUSED(transposeType); + return IsNone(transposeType); +} + +bool ArchEthosU55::SupportsReverse(OpType opType, ReverseType reverseType) +{ + UNUSED(opType); + UNUSED(reverseType); + return reverseType == ReverseType::None; +} + +bool ArchEthosU55::SupportsGather(OpType opType) +{ + UNUSED(opType); + return false; +} + +bool ArchEthosU55::SupportsScatter(OpType opType) +{ + UNUSED(opType); + return false; +} +bool ArchEthosU55::SupportsResize(const ResizeSupportQuery &query) +{ + UNUSED(query); + return false; +} + +bool ArchEthosU55::SupportsSigmoidTanhLutInt16(OpType opType) +{ + UNUSED(opType); + return false; +} + +bool ArchEthosU55::SupportsAccumulatorMode(ArchAccumulatorSource source, bool outputEnabled) +{ + return source == ArchAccumulatorSource::Reset && outputEnabled; +} + +bool ArchEthosU55::SupportsScalar(OpType opType, DataType dataType, TensorUsage usage) +{ + bool supportedType(dataType == DataType::Int8 || dataType == DataType::UInt8 || dataType == DataType::Int16); + return EthosU55RCSGenerator::IsSupportedElementwise(opType) && supportedType && IsIFM(usage); +} + +bool ArchEthosU55::SupportsArgMax(OpType opType) +{ + UNUSED(opType); + return false; +} + +Flags ArchEthosU55::SupportedWeightFormat(OpType) +{ + return WeightFormat::Default; +} + +bool ArchEthosU55::UseAvgPoolNop(OpType type) +{ + return IsActivation(type) || type == OpType::Quantize || type == OpType::MemoryCopy; +} + +static bool ChooseKernelMethod(const 
Shape &ifmShape, int ifmBits, const Kernel *kernel) +{ + if ( ifmShape.Depth() <= 8 ) + { + return true; + } + + // Compare part-kernel to depth-kernel and choose the one with best utilisation + int kernelElements = kernel->ElementsWH(); + double depthUtilisation = ifmShape.Depth() / double(RoundAway(ifmShape.Depth(), ifmBits == 8 ? 32 : 16)); + double partUtilisation = + (ifmShape.Depth() / double(RoundAway(ifmShape.Depth(), 8)) * + (kernelElements / double(RoundAway(kernelElements, ifmBits == 8 ? 4 : 2)))); + + return partUtilisation >= depthUtilisation; +} + + +static Shape GetArchIFMBlockSize(const Shape &ofmBlock, const Kernel *kernel, const Shape &ublock, + const Shape &subkernelLimit, int upscale, int rounding) +{ + Point2i dilatedSize = kernel->DilatedWH(); + + // IFM block height + int h = RequiredInputSize(ofmBlock.Height(), kernel->Stride().y, std::min(dilatedSize.y, subkernelLimit.Height()), upscale, rounding); + h = RoundAway(h, ublock.Height()); + + // IFM block width + int w = RequiredInputSize(ofmBlock.Width(), kernel->Stride().x, std::min(dilatedSize.x, subkernelLimit.Width()), upscale, rounding); + w = RoundAway(w, ublock.Width()); + + return Shape(1, h, w, ofmBlock.Depth()); +} + + +static Shape FitBlockForOFM(const Shape &ofmShape, const Kernel *kernel, const Shape &block, int ublockHeight) +{ + // 256/512 __Conv1D__ optimisation (ratio of IFM:Accumulators changes) This is a specific + // interpretation of a more general constraint that can't be applied because the + // FindBlockConfig function must return block configs that can be applied to any OFM shape. 
+ if ( (ofmShape.Height() == 1) && (kernel->Size().y == 1) && (ublockHeight == 2) ) + { + return Shape(1, std::min(block.Height(), ofmShape.Height()), block.Width(), block.Depth()); + } + return block; +} + + +std::unique_ptr ArchEthosU55::FindBlockConfig(OpType opType, const ArchitectureConfigQuery &query) +{ + assert(query.ifmBits > 0 && query.ifmBits <= 32); + assert(query.ofmShape.Size() > 2 && "Insufficient dimensions to search for block config"); + assert(query.kernel != nullptr); + + if ( !SupportsAccumulatorMode(query.accSource, query.accOutputEnabled) ) return nullptr; + + const int OFMSplitDepth = 16; // Specific to this architecture + + // Elementwise larger-volume correction + const Shape &ifmShape = (query.ifmShape[1].Elements() > query.ifmShape[0].Elements()) ? query.ifmShape[1] : query.ifmShape[0]; + + EthosU55NpuOp npuOp = GetHWOp(opType); + assert(npuOp != EthosU55NpuOp::None); + + // Figure out if SHRAM should be portioned for elementwise + ElementwiseUsage ewUsage = ElementwiseUsage::No; + if ( npuOp == EthosU55NpuOp::Elementwise ) + { + bool usesScalar = query.ifmShape[0].Elements() == 1; + if ( query.ifmShape[1].IsValid() ) + { + usesScalar = usesScalar || query.ifmShape[1].Elements() == 1; + } + + ewUsage = (usesScalar && (query.ifmBits <= 16)) ? 
ElementwiseUsage::Scalar : ElementwiseUsage::Full; + } + + // Operator typing help + bool isPooling = npuOp == EthosU55NpuOp::Pooling || npuOp == EthosU55NpuOp::ReduceSum; + bool isReduceSum = npuOp == EthosU55NpuOp::ReduceSum; + bool isDepthwise = npuOp == EthosU55NpuOp::Depthwise; + bool isEqualDepthOp = (ewUsage != ElementwiseUsage::No) || (isPooling && !isReduceSum) || isDepthwise; + bool isPartKernel = npuOp == EthosU55NpuOp::Convolution && ChooseKernelMethod(ifmShape, query.ifmBits, query.kernel); + + // Operator configuration to be returned + auto config = std::make_unique(); + config->_bankSize = _shram.bankSizeBytes; + // IFM is not broadcasted for pooling and depthwise ops and for elementwise + // when there's no elementwise-broadcasting in depth + int ifmDepthBufScaling = + isPooling || isDepthwise || IsUnaryElementwise(opType) || + (IsBinaryElementwise(opType) && (query.ifmShape[0].Depth() == query.ifmShape[1].Depth())) ? + _cores : + 1; + config->_ifmDepthBufScaling = ifmDepthBufScaling; + config->_traversal = isDepthwise ? EthosUTraversal::Depthwise : (isPartKernel ? 
EthosUTraversal::PartKernel : EthosUTraversal::DepthFirst); + + // Accumulator & granule settings + EthosU55SHRamElements accType = SHRAM_Acc32; + if ( query.ifmBits == 16 && (!isPooling || isReduceSum) && query.scaled ) + { + accType = SHRAM_Acc40; + } + config->_accumulatorType = accType; + + // Memory rounding granules + int accGranule = _shramGranules[accType]; + int accBits = s_SHRAMElementBits[accType]; + int ifmGranule = 0; + if ( ewUsage != ElementwiseUsage::No ) + { + ifmGranule = _ifmEWBankGranules[query.ifmBits / 8 - 1]; + } + else + { + ifmGranule = _ifmBankGranules[query.ifmBits / 8 - 1]; + } + + int rounding; + int upscale = UpscaleAndRounding(query.ifmResampling, rounding); + int lutBanks = std::max(DivRoundUp(query.lutBytes, 1024), _shram.reservedEndBanks); + + // Subkernel repeats of the IFM + Point2i dilatedWH = query.kernel->DilatedWH(); + int ifmRepeats = DivRoundUp(dilatedWH.x, _subkernelMax.Width()) * DivRoundUp(dilatedWH.y, _subkernelMax.Height()); + + int ifmBlockDepth = 0; + if ( query.ifmBits == 16 ) + { + ifmBlockDepth = RoundAway(std::min(ifmShape.Depth(), 16), 4); + } + else + { + ifmBlockDepth = RoundAway(std::min(ifmShape.Depth(), isPartKernel ? 16 : 32), _ifmUBlock.Depth()); + } + + // Weights fetch (for operators that have them) + bool hasWeights = npuOp == EthosU55NpuOp::Convolution || isDepthwise; + int weightFetchWH = hasWeights ? 
query.kernel->Size().AreaXY() : 0; + + int ofmUBlockDepth = _ofmUBlock.Depth() * _cores; + Shape searchSpace = Shape::RoundAway(Shape::Min(query.ofmShape, _ofmBlockMax), _ofmUBlock.WithDepth(ofmUBlockDepth)); + + // Block WHC search, loops across the search space looking for best efficiency + float bestCost = std::numeric_limits::infinity(); + float bestCoverage = std::numeric_limits::infinity(); + int ofmElements = query.ofmShape.Elements(); + + int depth = std::max(ofmUBlockDepth, std::min(searchSpace.Depth(), OFMSplitDepth)); + if ( depth < query.ofmShape.Depth() ) + { + depth = RoundAway(depth, OFMSplitDepth); + } + + std::unordered_set> wontFit; + while ( depth <= searchSpace.Depth() ) + { + for ( int height = _ofmUBlock.Height(); height <= searchSpace.Height(); height += _ofmUBlock.Height() ) + { + for ( int width = _ofmUBlock.Width(); width <= searchSpace.Width(); width += _ofmUBlock.Width() ) + { + // Avoid checking W/H transposed blocks that already didn't fit. i.e. if 8x4x16 didn't + // fit, then 4x8x16 won't either. 
+ if ( wontFit.count(Point2i(height, width)) > 0 ) + { + continue; + } + + // Calculate the IFM block dimensions required to feed this OFM block + Shape ofmBlock = Shape(height, width, depth); + Shape ifmBlock = GetArchIFMBlockSize(ofmBlock, query.kernel, _ofmUBlock, _subkernelMax, upscale, rounding); + if ( !isEqualDepthOp ) + { + ifmBlock[-1] = ifmBlockDepth; + } + + ofmBlock = FitBlockForOFM(query.ofmShape, query.kernel, ofmBlock, _ofmUBlock.Height()); + + // Test if the IFM/OFM blocks fit into SHRAM + EthosU55OpConfig::SHRAMLayout layout; + if ( TryBlockConfig(layout, int(ewUsage), ofmBlock, ifmBlock, query.ifmBits, ifmGranule, accBits, accGranule, lutBanks, ifmDepthBufScaling) ) + { + Shape fullBlocks = Shape::DivRoundUp(query.ofmShape, ofmBlock); + Point3 blocks = query.ofmShape.HWC() / ofmBlock.HWC(); + + // Weights fetching + float weightFetch = float(weightFetchWH * ifmShape.Depth() * fullBlocks.ElementsWH()); + if ( !isDepthwise ) + { + weightFetch *= blocks.z * float(ofmBlock.Depth()); + } + + // IFM fetching + float ifmFetch = float(ifmBlock.ElementsWH() * ifmShape.Depth() * ifmRepeats) * blocks.x * blocks.y; + if ( !isEqualDepthOp ) + { + ifmFetch *= float(fullBlocks.Depth()); + } + + // Scale relative to every output OFM element + float relativeCost = + npuOp == EthosU55NpuOp::Elementwise ? 
float(ofmElements) / float(height * width * depth) : (ifmFetch + weightFetch) / float(ofmElements); + + // If the entire IFM can be encompassed by both buffers, bias to prefer this configuration + if ( ifmShape.Elements() < ifmBlock.Elements() * 2 ) + { + relativeCost = relativeCost / 2.0f; + } + + // Choose based on relative minimum cost or larger IFM area (if equal cost) + if ( relativeCost <= bestCost ) + { + bool chooseThis = false; + // Check IFM coverage only when it's equal best_cost and small OFM + if ( relativeCost == bestCost ) + { + Shape coverageShape = Shape::Min(ifmShape, ifmBlock); + float coverage = float(ifmShape.ElementsWH()) / float(coverageShape.ElementsWH()); + // Small 4x4 IFM constraint found through analysis of networks + if ( coverage <= bestCoverage && (height <= 4 && width <= 4) ) + { + bestCoverage = coverage; + chooseThis = true; + } + } + else + { + bestCoverage = std::numeric_limits::infinity(); + chooseThis = true; + } + + if ( chooseThis ) + { + bestCost = relativeCost; + config->_layout = layout; + config->_ifmBlock = ifmBlock; + config->_ofmBlock = Shape(1, height, width, depth); + } + } + } + else + { + wontFit.emplace(width, height); + } + } + } + + // Try Next block depth, rounded + depth = depth + ofmUBlockDepth; + if ( depth < query.ofmShape.Depth() ) + { + depth = RoundAway(depth, OFMSplitDepth); + } + } + + // Return the best configuration + if ( bestCost != std::numeric_limits::infinity() ) + { + return std::unique_ptr(config.release()); + } + + // Didn't find a configuration + return std::unique_ptr(); +} + + +bool ArchEthosU55::TryBlockConfig(EthosU55OpConfig::SHRAMLayout &layout, int ewUsage, const Shape &ofmBlock, + const Shape &ifmBlock, int ifmBits, int ifmGranule, int accBits, int accGranule, int lutBanks, int ifmDepthBufScaling) +{ + assert((accBits > 0) && (accGranule > 0)); + assert((ifmBits >= 8) && ((ifmBits % 8) == 0) && (ifmGranule > 0)); + + // Scale depth with cores + int ifm_depth = 
DivRoundUp(ifmBlock.Depth(), ifmDepthBufScaling); + int ofm_depth = DivRoundUp(ofmBlock.Depth(), _cores); + + // Always need IFM space + int ifm_bytes = ifmBlock.ElementsWH() * RoundAway(ifm_depth * (ifmBits / 8), 8); + int ifm_banks = DivRoundUp(ifm_bytes, _shram.bankSizeBytes) * 2; + ifm_banks = RoundAway(ifm_banks, ifmGranule); + + // Calculate SHRAM boundaries of the IFM and Accumulators + int lut_start = _shram.totalBanks - lutBanks; + int ifm_end = _shram.reservedOutputBanks + ifm_banks; + int ifm2_start = ifm_end; + int acc_start = lut_start; + + // If not elementwise then we need accumulator space + if ( ewUsage == int(ElementwiseUsage::No) ) + { + int acc_bytes = (ofmBlock.ElementsWH() * RoundAway(ofm_depth, 8) * accBits) / 8; + int acc_banks = DivRoundUp(acc_bytes, _shram.bankSizeBytes) * 2; + acc_banks = RoundAway(acc_banks, accGranule); + acc_start = acc_start - acc_banks; + } + else + { + int ifm2_banks = (ewUsage == int(ElementwiseUsage::Full)) ? ifm_banks : 0; + if ( ifm2_start + ifm2_banks > acc_start ) + { + return false; + } + ifm_end = acc_start; + } + + // IFM must still fit before accumulators + if ( ifm_end > acc_start ) + { + return false; + } + + // Should all fit, so return this layout + layout.ibStart = _shram.reservedOutputBanks; + layout.ibStart2 = ifm2_start; + layout.ibEnd = ifm_end; + layout.abStart = acc_start; + layout.lutStart = lut_start; + return true; +} + + +Shape ArchEthosU55::GetStorageRounding(TensorFormat format) +{ + if ( format == TensorFormat::NHCWB16 ) + { + return Shape(1, 1, 1, 16); + } + + return Shape(1, 1, 1, 1); +} + + +uint32_t ArchEthosU55::ConfigRegister(int product) +{ + uint32_t macs = _macs * _cores; + int macsCeilLog2 = 0; + while ( macs >>= 1 ) + { + macsCeilLog2++; + } + int shramSize = _cores * (int(_shramMemory->SizeBytes()) >> 10); + assert(macsCeilLog2 < 16); + assert(shramSize < 256); + return macsCeilLog2 | (shramSize << 8) | (product << 28); +} + +std::unique_ptr EthosU55OpConfig::Clone() +{ + auto 
config = std::make_unique(); + config->_bankSize = _bankSize; + config->_ifmDepthBufScaling = _ifmDepthBufScaling; + config->_traversal = _traversal; + config->_accumulatorType = _accumulatorType; + config->_ofmBlock = _ofmBlock; + config->_ifmBlock = _ifmBlock; + config->_layout = _layout; + return std::unique_ptr(config.release()); +} + +int EthosU55OpConfig::MaxIFMBuffering() +{ + return (_layout.ibEnd - _layout.ibStart) * _bankSize * _ifmDepthBufScaling; +} + +Point2i EthosU55OpConfig::OptimalStripeGranule() +{ + return _ofmBlock.WH(); +} + +int EthosU55OpConfig::OptimalDepthGranule() +{ + return _ofmBlock.Depth(); +} + +std::string EthosU55OpConfig::ToString(bool full) +{ + std::string tmp = fmt::format("OFM Block=[{}], IFM Block=[{}], Traversal={}, AccType={}", _ofmBlock.ToString(), + _ifmBlock.ToString(), EnumToString(_traversal), EnumToString(_accumulatorType)); + if ( full ) + { + tmp += fmt::format("\nSHRAM: ib={} ibE={}, ib2={}, ab={}, lut={}", _layout.ibStart, _layout.ibEnd, + _layout.ibStart2, _layout.abStart, _layout.lutStart); + } + return tmp; +} + +EthosU55NpuOp ArchEthosU55::GetHWOp(OpType type) +{ + static const std::unordered_map toNpuOp = { + {OpType::DepthwiseConv2DBias, EthosU55NpuOp::Depthwise}, + {OpType::Conv2D, EthosU55NpuOp::Convolution}, + {OpType::Conv2DBackpropInput, EthosU55NpuOp::Convolution}, + {OpType::Conv2DBackpropInputSwitchedBias, EthosU55NpuOp::Convolution}, + {OpType::Conv2DBias, EthosU55NpuOp::Convolution}, + {OpType::FullyConnected, EthosU55NpuOp::VectorProduct}, + {OpType::MaxPool, EthosU55NpuOp::Pooling}, + {OpType::AvgPool, EthosU55NpuOp::Pooling}, + {OpType::QuantizedAvgPool, EthosU55NpuOp::Pooling}, + {OpType::QuantizedMaxPool, EthosU55NpuOp::Pooling}, + {OpType::ResizeBilinear, EthosU55NpuOp::Pooling}, + {OpType::ReduceSum, EthosU55NpuOp::ReduceSum}, + }; + auto pos = toNpuOp.find(type); + if ( pos != toNpuOp.end() ) + { + return pos->second; + } + else if ( EthosU55RCSGenerator::IsSupportedElementwise(type) ) + { + 
return EthosU55NpuOp::Elementwise; + } + else if ( UseAvgPoolNop(type) ) + { + return EthosU55NpuOp::Pooling; + } + return EthosU55NpuOp::None; +} + +int EthosU55OpGroup::Add(const ArchitectureOpGroupQuery &op, const std::vector &dependsOn) +{ + LOG_TRACE1("Trying to add op {}\n", OpTypeToString(op.type)); + + if ( _opsCount >= 2 ) + { + // Can only fuse 2 ops + return 0; + } + + for ( int dep : dependsOn ) + { + if ( dep > 0 ) + { + // Don't validate user-specified (positive keys) dependencies + continue; + } + else if ( dep < 0 ) + { + // Convert to group generated keys (negative keys) to array index + dep = (-dep) - 1; + if ( dep >= _opsCount ) + { + // Missing dependency + return 0; + } + } + + const EthosU55OpGroup::OpInfo &prevOp = _ops[dep]; + if ( prevOp.ofm.key != op.ifm.key && prevOp.ofm.key != op.ifm2.key ) + { + // Can only fuse when ops are connected + return 0; + } + } + + if ( !CanRunOnNPU(op) ) + { + // Can only fuse NPU ops + return 0; + } + + if ( _opsCount > 0 && !IsActivation(op.type) ) + { + // Can only fuse with activation + return 0; + } + + // Generated key + int key = (-_opsCount) - 1; + + // Save copy of op + _ops[_opsCount].type = op.type; + _ops[_opsCount].ifm.key = op.ifm.key; + _ops[_opsCount].ifm.type = op.ifm.type; + _ops[_opsCount].ifm2.key = op.ifm2.key; + _ops[_opsCount].ifm2.type = op.ifm2.type; + _ops[_opsCount].ofm.key = op.ofm.key; + _ops[_opsCount].ofm.type = op.ofm.type; + _opsInternal[_opsCount].dependsOn = dependsOn; + _opsCount++; + + return key; +} + +// Table of allowed ifm/ofm data type combinations for each HWOp +static const std::unordered_map>> s_opDataTypeSupport = { + {EthosU55NpuOp::Convolution, // HWOp + { + // IFM data type | OFM data type(s) + {DataType::UInt8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}}, + {DataType::Int8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}}, + {DataType::Int16, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}}, + 
}}, + {EthosU55NpuOp::Depthwise, + { + {DataType::UInt8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}}, + {DataType::Int8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}}, + {DataType::Int16, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}}, + }}, + {EthosU55NpuOp::VectorProduct, + { + {DataType::UInt8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}}, + {DataType::Int8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}}, + {DataType::Int16, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}}, + }}, + {EthosU55NpuOp::Pooling, + { + {DataType::UInt8, {DataType::UInt8}}, + {DataType::Int8, {DataType::Int8}}, + {DataType::Int16, {DataType::Int16}}, + }}, + {EthosU55NpuOp::ReduceSum, + { + {DataType::UInt8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}}, + {DataType::Int8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}}, + {DataType::Int16, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}}, + {DataType::Int32, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}}, + }}, +}; + +bool EthosU55OpGroup::CanRunOnNPU(const ArchitectureOpGroupQuery &op) +{ + EthosU55NpuOp npuOp = ArchEthosU55::GetHWOp(op.type); + + if ( IsFloat(op.ifm.type | op.ifm2.type | op.ofm.type) ) + { + return false; + } + + if ( npuOp == EthosU55NpuOp::None ) + { + return false; + } + + auto k = op.kernel; + if ( k->Stride().x > 3 || k->Stride().y > 3 ) + { + return false; + } + + if ( k->Dilation().x > 2 || k->Dilation().y > 2 ) + { + return false; + } + + switch ( npuOp ) + { + case EthosU55NpuOp::Convolution: + case EthosU55NpuOp::Depthwise: + case EthosU55NpuOp::VectorProduct: + case EthosU55NpuOp::Pooling: + case EthosU55NpuOp::ReduceSum: + case EthosU55NpuOp::Elementwise: + break; + default: + assert(false && "Unrecognized HWOp"); + return false; + } + + // Check allowed ifm/ofm type mapping + if ( npuOp 
!= EthosU55NpuOp::Elementwise ) + { + if ( op.type == OpType::LUT || op.type == OpType::MemoryCopy ) + { // TODO: LUT operations end up here due to UseAvgPoolNop although the rules are not the same as + // for a Pooling operation, so skip checks for now. + return true; + } + + auto map = s_opDataTypeSupport.find(npuOp); + if ( map == s_opDataTypeSupport.end() ) + { + assert(false && "Data type mapping for HWOp missing"); + return false; + } + auto &typeMap = map->second; + auto ifmEntry = typeMap.find(op.ifm.type); + if ( ifmEntry == typeMap.end() ) + { // Unsupported ifm data type + return false; + } + auto &ofmTypes = ifmEntry->second; + if ( 0 == std::count(ofmTypes.begin(), ofmTypes.end(), op.ofm.type) ) + { // Unsupported ofm data type + return false; + } + } + else + { + std::vector validIfmTypes; + std::vector validOfmTypes; + switch ( op.type ) + { + case OpType::Add: + case OpType::Sub: + case OpType::Mul: + { + validIfmTypes = {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}; + validOfmTypes = validIfmTypes; + } + break; + case OpType::Minimum: + case OpType::Maximum: + case OpType::LeakyRelu: + case OpType::Abs: + { + validIfmTypes = {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}; + validOfmTypes = {op.ifm.type}; + } + break; + case OpType::CLZ: + case OpType::SHL: + case OpType::Asr: + { + validIfmTypes = {DataType::Int32}; + validOfmTypes = {DataType::Int32}; + if ( op.type == OpType::Asr ) + { + validOfmTypes.insert(validOfmTypes.begin(), {DataType::UInt8, DataType::Int8, DataType::Int16}); + } + } + break; + default: + assert(false && "Unkown elementwise type"); + break; + } + + if ( 0 == std::count(validIfmTypes.begin(), validIfmTypes.end(), op.ifm.type) ) + { // Unsupported ifm data type + return false; + } + if ( IsBinaryElementwise(op.type) && op.ifm2.type != op.ifm.type ) + { // ifm2 data type must match ifm data type + return false; + } + if ( 0 == std::count(validOfmTypes.begin(), validOfmTypes.end(), 
op.ofm.type) ) + { // Unsupported ofm data type + return false; + } + } + + return true; +} + +} // namespace regor diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55.hpp b/ethosu/regor/architecture/ethosu55/ethos_u55.hpp new file mode 100644 index 00000000..d83ec42a --- /dev/null +++ b/ethosu/regor/architecture/ethosu55/ethos_u55.hpp @@ -0,0 +1,239 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#pragma once + +#include "architecture/architecture.hpp" +#include "architecture/ethos_u_scaling.hpp" +#include "architecture/register_command_stream_generator.hpp" +#include "architecture/weight_encoder.hpp" +#include "common/bit_flags.hpp" +#include "common/shape.hpp" +#include "ethos_u55_performance.hpp" + +#include + +namespace regor +{ + +enum EthosU55SHRamElements +{ + SHRAM_IFM8 = 0, + SHRAM_IFM16 = 1, + SHRAM_IFM8_Elementwise = 2, + SHRAM_IFM16_Elementwise = 3, + SHRAM_IFM32 = 4, + SHRAM_Acc16 = 5, + SHRAM_Acc32 = 6, + SHRAM_Acc40 = 7, + SHRAM_Last = SHRAM_Acc40 +}; + +enum class EthosUTraversal +{ + DepthFirst = 0, + PartKernel = 1, + Depthwise = 2, +}; + +class ArchEthosU55; + +enum class EthosU55NpuOp +{ + None = 0, + Convolution, + Depthwise, + VectorProduct, + Pooling, + ReduceSum, + Elementwise, +}; + +/// +/// Per-operator architecture configuration +/// +class EthosU55OpConfig : public ArchitectureOpConfig +{ + friend class ArchEthosU55; + friend class EthosU55RCSGenerator; + +public: + struct SHRAMLayout + { + int ibStart = 0; + int ibEnd = 0; + int ibStart2 = 0; + int abStart = 0; + int lutStart = 0; + }; + +private: + SHRAMLayout _layout; + Shape _ifmBlock; + Shape _ofmBlock; + EthosU55SHRamElements _accumulatorType = SHRAM_Acc32; + EthosUTraversal _traversal = EthosUTraversal::DepthFirst; + int _bankSize = 0; + int _ifmDepthBufScaling = 0; + +public: + EthosUTraversal Traversal() const { return _traversal; } + const Shape &IfmBlock() const { return _ifmBlock; } + const Shape &OfmBlock() const { return _ofmBlock; } + EthosU55SHRamElements Acc() const { return _accumulatorType; } + + std::unique_ptr Clone() override; + int MaxIFMBuffering() override; + Point2i OptimalStripeGranule() override; + int OptimalDepthGranule() override; + std::string ToString(bool full) override; +}; + +/// +/// Group of ops that can be fused and/or chained +/// +class EthosU55OpGroup : public ArchitectureOpGroup +{ + friend class ArchEthosU55; + + using OpInfo = 
ArchitectureOpGroupQuery; + + struct InternalOpInfo + { + std::vector dependsOn; + }; + +private: + std::array _ops; + std::array _opsInternal; + int _opsCount = 0; + +public: + int Add(const ArchitectureOpGroupQuery &op, const std::vector &dependsOn = {}) override; + +protected: + bool CanRunOnNPU(const ArchitectureOpGroupQuery &op) override; +}; + +/// +/// EthosU55 specialisation +/// +class ArchEthosU55 : public Architecture +{ + friend class EthosU55WeightEncoder; + friend class EthosU55Performance; + friend class EthosU55RCSGenerator; + friend class EthosU65RCSGenerator; + friend class EthosU55OpGroup; + +public: + struct AcceleratorConfig + { + int macs; + int cores; + Shape ofmUBlock; + Shape ifmUblock; + int shramBanks; + int8_t shramGranules[8]; + int elemUnits; + const EthosU55PerfInfo *perfInfo; + }; + +private: + std::unique_ptr _shramMemory; + Shape _subkernelMax; + Shape _ofmBlockMax; + int _cores = 0; + int _macs = 0; + Shape _ofmUBlock; + Shape _ifmUBlock; + const int8_t *_shramGranules = nullptr; + int _ifmBankGranules[4] = {0}; + int _ifmEWBankGranules[4] = {0}; + + struct + { + int reservedOutputBanks = 0; + int bankSizeBytes = 0; + int totalBanks = 0; + int reservedEndBanks = 0; + int lutBanks = 2; + int lutSlotSize = 256; + } _shram; + +protected: + std::unique_ptr _weightEncoder; + std::unique_ptr _performance; + std::unique_ptr _rcsGenerator; + +public: + ArchEthosU55(); + +public: + bool ParseConfig(IniReader *reader) override; + + std::unique_ptr GetOpConfig(OpType opType, const ArchitectureConfigQuery &query) override; + std::unique_ptr CreateOpGroup(const ArchitectureOpGroupQuery &op) override; + class WeightEncoder *WeightEncoder() override { return _weightEncoder.get(); } + IRegisterCommandStreamGenerator *RegisterCommandStreamGenerator() override { return _rcsGenerator.get(); } + ArchitecturePerformance *Performance() override { return _performance.get(); } + TensorFormat IdealBufferingFormat() override { return TensorFormat::NHCWB16; 
} + Address MaxAddress() override { return 1LL << 32; } + std::vector ConfigRegisters() override; + int UpscaleAndRounding(ArchResampling resampling, int &rounding) override; + AxisMask CanSubdivide(OpType opType) override; + bool SupportsLeakyRelu(bool quantized, DataType type) override; + bool SupportsMatMul(OpType opType) override; + bool SupportsTranspose(OpType opType, TransposeType transposeType) override; + bool SupportsReverse(OpType opType, ReverseType reverseType) override; + bool SupportsGather(OpType opType) override; + bool SupportsScatter(OpType opType) override; + bool SupportsSigmoidTanhLutInt16(OpType opType) override; + bool SupportsResize(const ResizeSupportQuery &query) override; + bool SupportsAccumulatorMode(ArchAccumulatorSource source, bool outputEnabled) override; + bool SupportsScalar(OpType opType, DataType dataType, TensorUsage usage) override; + bool SupportsArgMax(OpType opType) override; + Flags SupportedWeightFormat(OpType op) override; + uint32_t Version() override; + +protected: + Shape OfmUBlock() { return _ofmUBlock; } + void ApplyConfig(const AcceleratorConfig *cfg); + + std::unique_ptr FindBlockConfig(OpType opType, const ArchitectureConfigQuery &query); + + bool TryBlockConfig(EthosU55OpConfig::SHRAMLayout &layout, int ewUsage, const Shape &ofmBlock, const Shape &ifmBlock, + int ifmBits, int ifmGranule, int accBits, int accGranule, int lutBanks, int ifmDepthBufScaling); + + Shape GetStorageRounding(TensorFormat format); + + uint32_t ConfigRegister(int product); + + bool IsU55_32() const { return (_macs == 32) && (_cores == 1); } + + // Checks if the operation is to be mapped on AvgPool + static bool UseAvgPoolNop(OpType type); + static EthosU55NpuOp GetHWOp(OpType type); + +private: + int MaxOutstandingKernelOps() { return 2; } + virtual int MaxOutstandingDMAOps() { return 1; } + int MaxBlockdep() { return 3; } +}; + +} // namespace regor diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_performance.cpp 
b/ethosu/regor/architecture/ethosu55/ethos_u55_performance.cpp new file mode 100644 index 00000000..527e159b --- /dev/null +++ b/ethosu/regor/architecture/ethosu55/ethos_u55_performance.cpp @@ -0,0 +1,531 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "ethos_u55_performance.hpp" + +#include "common/common.hpp" + +#include "architecture/architecture.hpp" +#include "ethos_u55.hpp" + +namespace regor +{ + +static const Point2i s_SubkernelLimits[] = { + {0, 0}, // No kernel + {8, 8}, // Convolution + {8, 8}, // Depthwise + {1, 1}, // VectorProduct + {8, 8}, // Pooling + {8, 8}, // ReduceSum + {1, 1}, // Elementwise +}; + +static constexpr bool OpUsesMacs(EthosU55NpuOp npuOp) +{ + return (npuOp != EthosU55NpuOp::Elementwise && npuOp != EthosU55NpuOp::None); +} + +EthosU55Performance::EthosU55Performance(ArchEthosU55 *arch, const EthosU55PerfInfo *perfInfo) : _arch(arch) +{ + _perfInfo = perfInfo; +} + +CycleCost EthosU55Performance::MeasureCycleCost(const PerformanceQuery &query, const std::vector &fused) +{ + CycleCost cycles; + auto npuOp = _arch->GetHWOp(query.type); + + // Convolution/Vector product cycle calculation + if ( OpUsesMacs(npuOp) ) + { + if ( (npuOp == EthosU55NpuOp::Depthwise) || (npuOp == EthosU55NpuOp::Pooling) ) + { + cycles.macs = int64_t(query.kernel->ElementsWH()) * query.ofmShape.Elements() * 1; + } + 
else + { + cycles.macs = int64_t(query.kernel->ElementsWH()) * query.ofmShape.Elements() * query.ifmShape[0].Depth(); + } + + cycles.opCycles = EstimateConvCycles(query, fused); + } + // Elementwise cycle calculation + else if ( npuOp == EthosU55NpuOp::Elementwise ) + { + auto ofmShape = + (query.ofmFormat == TensorFormat::NHCWB16) ? Shape::RoundAway(query.ofmShape, Shape(1, 1, 1, 16)) : query.ofmShape; + cycles.opCycles = int64_t(EstimateOutputCyclesPerElement(query, fused) * float(ofmShape.Elements())); + } + else + { + assert(false && "Unknown operator cycle costing"); + } + + return cycles; +} + +int64_t EthosU55Performance::MemToMemCycles(const ArchitectureMemory *dest, const ArchitectureMemory *source, int sizeBytes) +{ + int64_t fromCycles = int64_t(float(sizeBytes) / source->Bandwidth()); + fromCycles += source->ReadLatency(); + int64_t toCycles = int64_t(float(sizeBytes) / dest->Bandwidth()); + toCycles += source->WriteLatency(); + return std::max(fromCycles, toCycles); +} + +int64_t EthosU55Performance::EstimateConvCycles(const PerformanceQuery &query, const std::vector &fused) +{ + EthosU55OpConfig *opConfig = static_cast(query.config); + auto npuOp = _arch->GetHWOp(query.type); + assert(npuOp != EthosU55NpuOp::None); + + Shape ifmBlock = Shape::Min(query.ifmShape[0], opConfig->IfmBlock()); + Shape ofmBlock = Shape::Min(query.ofmShape, opConfig->OfmBlock()); + Shape ofmUBlock = _arch->OfmUBlock(); + + // HW Optimisation check + if ( (ofmUBlock.Height() == 2) && (npuOp == EthosU55NpuOp::Convolution || npuOp == EthosU55NpuOp::VectorProduct) && + (query.ofmShape.Height() == 1) && (query.ofmShape.Width() % 2 == 0) && // Optimisation only applies for even + // width tensors + (query.kernel->Size().y == 1) ) + { + ofmUBlock = Shape(1, 1, 4, ofmUBlock.Depth()); + ofmBlock = ofmBlock.WithHeight(1); + } + + int ifmBits = DataTypeSizeBits(query.ifmType[0]); + Shape numUBlocks = Shape::DivRoundUp(ofmBlock, ofmUBlock); + bool use40BitAcc = opConfig->Acc() == 
EthosU55SHRamElements::SHRAM_Acc40; + + int64_t cyclesDpuBlk = 0; + int cyclesWb = 32 * ofmUBlock.Depth() / 8; + + int subKernelWidth = s_SubkernelLimits[int(npuOp)].x; + int subKernelHeight = s_SubkernelLimits[int(npuOp)].y; + const Point2i kernelSize = query.kernel->Size(); + bool isConvolutionMxN = (npuOp == EthosU55NpuOp::Convolution); + + for ( int x = 0; x < kernelSize.x; x += subKernelWidth ) + { + for ( int y = 0; y < kernelSize.y; y += subKernelHeight ) + { + int subKernelElements = std::min(kernelSize.y - y, subKernelHeight); + subKernelElements *= std::min(kernelSize.x - x, subKernelWidth); + + // Calculate processing cycles + int numKernelSteps = 0; + int cycles = 0; + if ( npuOp == EthosU55NpuOp::Pooling ) + { + numKernelSteps = 1; + cycles = std::max(4, subKernelElements) * numUBlocks.Elements(); + if ( !_arch->IsU55_32() ) + { + cycles = cycles * (ifmBits / 2); + } + } + else if ( npuOp == EthosU55NpuOp::Depthwise ) + { + numKernelSteps = DivRoundUp(subKernelElements, 4); + cycles = 4 * numUBlocks.ElementsWH() * (ifmBits / 8); + cycles = std::max(cyclesWb, cycles) * numKernelSteps * numUBlocks.Depth(); + } + else if ( (isConvolutionMxN && opConfig->Traversal() != EthosUTraversal::PartKernel) || + npuOp == EthosU55NpuOp::VectorProduct || npuOp == EthosU55NpuOp::ReduceSum ) + { + numKernelSteps = subKernelElements; + cycles = std::max(cyclesWb, 4 * numUBlocks.ElementsWH()) * numKernelSteps * numUBlocks.Depth(); + } + else + { + assert(opConfig->Traversal() == EthosUTraversal::PartKernel); + int divider = (ifmBits == 16) ? 2 : 4; + numKernelSteps = DivRoundUp(subKernelElements, divider); + cycles = std::max(cyclesWb, 4 * numUBlocks.ElementsWH()) * numKernelSteps * numUBlocks.Depth() * + DivRoundUp(ifmBlock.Depth(), 8); + } + + // Calculate delay + int delayCycles = 0; + if ( _arch->IsU55_32() ) + { + int delay = use40BitAcc ? 
7 : 3; + if ( numUBlocks.ElementsWH() == 1 ) + { + if ( numUBlocks.Depth() == 1 ) + { + delayCycles = delay * numKernelSteps; + } + else if ( numKernelSteps > 1 ) + { + delayCycles = delay * (numKernelSteps - 1) * numUBlocks.Depth(); + } + } + + if ( (numUBlocks.Width() == 1 || numUBlocks.Height() == 1) && (numUBlocks.Depth() > 1) && use40BitAcc ) + { + delayCycles += delay * numUBlocks.Depth(); + } + } + else + { + int delay = (use40BitAcc && (_arch->_macs <= 128)) ? 3 : 2; + + if ( numUBlocks.ElementsWH() == 1 ) + { + if ( numUBlocks.Depth() == 1 ) + { + delayCycles = delay * numKernelSteps; + } + else if ( numKernelSteps > 1 ) + { + delayCycles = delay * (numKernelSteps - 1) * numUBlocks.Depth(); + } + } + } + + if ( isConvolutionMxN && opConfig->Traversal() == EthosUTraversal::PartKernel ) + { + delayCycles *= DivRoundUp(ifmBlock.Depth(), 8); + } + + cyclesDpuBlk += cycles; + cyclesDpuBlk += delayCycles; + } + } + + if ( npuOp == EthosU55NpuOp::Convolution || npuOp == EthosU55NpuOp::VectorProduct || npuOp == EthosU55NpuOp::ReduceSum ) + { + cyclesDpuBlk *= DivRoundUp(query.ifmShape[0].Depth(), ifmBlock.Depth()); + } + + cyclesDpuBlk /= _arch->_cores; + + // Estimate output cycles + int numOfmBlks = Shape::DivRoundUp(query.ofmShape, ofmBlock).Elements(); + int64_t cyclesOutputBlk = int64_t(EstimateOutputCyclesPerElement(query, fused) * float(ofmBlock.Elements())); + + // Scale and bias tensor + if ( query.constShape.Size() > 0 && query.constShape.Depth() > 0 ) + { + int cyclesBiasBlk = (10 * ofmBlock.Depth() * query.constMemory->ReadLatency() / 256); + cyclesOutputBlk = std::max(cyclesOutputBlk, int64_t(cyclesBiasBlk)); + } + + int64_t cycles_cmd = EstimateMinimumMemoryCycles(query); + cycles_cmd = (cycles_cmd + cyclesOutputBlk + cyclesDpuBlk) / 4; // Per DPU + + cyclesDpuBlk = std::max(cyclesDpuBlk, cycles_cmd); + cyclesOutputBlk = std::max(cyclesOutputBlk, cycles_cmd); + + int64_t totalCycles = 0; + if ( cyclesDpuBlk > cyclesOutputBlk ) + { + totalCycles = 
cyclesDpuBlk * numOfmBlks + cyclesOutputBlk; + } + else + { + totalCycles = cyclesOutputBlk * numOfmBlks + cyclesDpuBlk; + } + + return totalCycles; +} + +static int EstimateMemoryTransfer(int cores, bool isRead, ArchitectureMemory *memory, TensorFormat format, + int elementBits, Shape block, Shape shape, int toTransfer) +{ + int burstLen = 8; + + if ( format == TensorFormat::NHCWB16 ) + { + int zStride = (shape.Width() * elementBits * 16) / 8; + if ( zStride == block.Depth() ) + { + burstLen = elementBits * block.Depth() * block.Width(); + } + else if ( isRead ) + { + burstLen = 16 * elementBits * block.Width(); + } + else + { + burstLen = 16 * elementBits * block.Width() * cores; + } + } + else if ( format == TensorFormat::NHWC ) + { + int xStride = (shape.Depth() * elementBits) / 8; + if ( isRead ) + { + if ( xStride == block.Depth() ) + { + burstLen = elementBits * block.Depth() * block.Width(); + } + else + { + burstLen = elementBits * block.Depth(); + } + } + else + { + if ( (block.Depth() <= 16) && xStride == block.Depth() ) + { + burstLen = elementBits * block.Depth() * block.Width(); + } + else + { + burstLen = std::min(std::min(64 * 8, 16 * elementBits * cores), block.Depth() * elementBits); + } + } + } + + burstLen = std::min(memory->MaxBurstLength(), burstLen / 8); + assert(burstLen > 0 && "Burst length cannot be zero"); + return (toTransfer * memory->MaxBurstLength()) / burstLen; +} + + +int64_t EthosU55Performance::EstimateMinimumMemoryCycles(const PerformanceQuery &query) +{ + EthosU55OpConfig *opConfig = static_cast(query.config); + + int ifmBits = DataTypeSizeBits(query.ifmType[0]); // All inputs expect same bit width + const int ifmCount = query.ifmShape[1].Elements() > 0 ? 
int(std::size(query.ifmShape)) : 1; + int64_t cyclesIfm = 0; + for ( int i = 0; i < ifmCount; i++ ) + { + // Input block HW transfer (only for elements present) + int ifmBytes = Shape::Min(query.ifmShape[i], opConfig->IfmBlock()).Elements() * ifmBits / 8; + int64_t cyclesIfmBlk = query.ifmMemory[i]->ReadLatency(); + int64_t tx = EstimateMemoryTransfer(_arch->_cores, true, query.ifmMemory[i], query.ifmFormat[i], ifmBits, + opConfig->IfmBlock(), query.ifmShape[i], ifmBytes); + cyclesIfmBlk += int64_t(float(tx) / query.ifmMemory[i]->Bandwidth()); + + cyclesIfm = std::max(cyclesIfm, cyclesIfmBlk); + } + + // Output block HW transfer (only for elements present) + int ofmBits = DataTypeSizeBits(query.ofmType); + int ofmBytes = Shape::Min(query.ofmShape, opConfig->OfmBlock()).Elements() * ofmBits / 8; + int64_t cyclesOfm = query.ofmMemory->WriteLatency(); + int64_t tx = EstimateMemoryTransfer(_arch->_cores, false, query.ofmMemory, query.ofmFormat, ofmBits, + opConfig->OfmBlock(), query.ofmShape, ofmBytes); + cyclesOfm += int64_t(float(tx) / query.ofmMemory->Bandwidth()); + + return cyclesIfm + cyclesOfm; +} + + +float EthosU55Performance::EstimateOutputCyclesPerElement(const PerformanceQuery &query, const std::vector &fused) +{ + EthosU55OpConfig *opConfig = static_cast(query.config); + auto npuOp = _arch->GetHWOp(query.type); + assert(npuOp != EthosU55NpuOp::None); + int ifmBits = DataTypeSizeBits(query.ifmType[0]); + int ofmBits = DataTypeSizeBits(query.ofmType); + int outputPerfIndex = 0; + + if ( (npuOp == EthosU55NpuOp::Elementwise) && (ifmBits == 32) ) + { + // Unary op else Binary op + outputPerfIndex = query.ifmShape[1].Elements() > 0 ? 
1 : 0; + } + else if ( query.type == OpType::Mul && ofmBits == 32 ) + { + outputPerfIndex = 2; + } + else if ( (query.type == OpType::Mul) || ((npuOp != EthosU55NpuOp::Elementwise) && opConfig->Acc() == EthosU55SHRamElements::SHRAM_Acc40) ) + { + outputPerfIndex = 3; + } + else if ( query.type == OpType::Add || query.type == OpType::Sub ) + { + if ( false ) + { + // Simple Add/Sub + outputPerfIndex = 4; + } + else + { + // Advanced Add/Sub TODO: Add as perf selection as operator variant + outputPerfIndex = 5; + } + } + else if ( query.type == OpType::MaxPool ) + { + outputPerfIndex = 6; + } + else + { + outputPerfIndex = 7; + } + + int activationPerfIndex = 0; + assert(fused.size() <= 1 && "multiple op performance not available"); + for ( const FusionQuery &fusedOp : fused ) + { + if ( fusedOp.type == OpType::Sigmoid || fusedOp.type == OpType::Tanh || fusedOp.type == OpType::LookupTable ) + { + activationPerfIndex = 0; + } + else if ( fusedOp.type == OpType::Relu || fusedOp.type == OpType::Relu6 || fusedOp.type == OpType::ReluN1To1 ) + { + activationPerfIndex = 1; + } + else + { + activationPerfIndex = 2; + } + } + + float cyclesPerElement = std::max(_perfInfo->outputCycles[outputPerfIndex], _perfInfo->activationCycles[activationPerfIndex]); + + if ( npuOp == EthosU55NpuOp::Elementwise ) + { + int numElemsBlk = opConfig->OfmBlock().Elements(); + assert(numElemsBlk > 0); + float cycleCmd = (float(EstimateMinimumMemoryCycles(query)) / float(numElemsBlk) + cyclesPerElement) / 4.0f; // per DPU + cyclesPerElement = std::max(cyclesPerElement, cycleCmd); + } + + return cyclesPerElement; +} + +ElementAccess EthosU55Performance::MeasureElementAccess(const PerformanceQuery &query) +{ + ElementAccess access; + EthosU55OpConfig *opConfig = static_cast(query.config); + auto npuOp = _arch->GetHWOp(query.type); + assert(npuOp != EthosU55NpuOp::None); + + Shape ifmBlock = Shape::Min(query.ifmShape[0], opConfig->IfmBlock()); + Shape ofmBlock = Shape::Min(query.ofmShape, 
opConfig->OfmBlock()); + + Shape ifmRounding = _arch->GetStorageRounding(query.ifmFormat[0]); + Shape ofmRounding = _arch->GetStorageRounding(query.ofmFormat); + + // Number of ofm blocks in the overall output shape + Shape ofmBlocks = Shape::DivRoundUp(query.ofmShape, ofmBlock); + + int ofmBlockDepth = ofmBlock.Depth(); + if ( npuOp == EthosU55NpuOp::Depthwise || npuOp == EthosU55NpuOp::Pooling ) + { + ofmBlocks = ofmBlocks.WithDepth(1); + ofmBlockDepth = query.ifmShape[0].Depth(); + } + + // Convolution & pooling + if ( OpUsesMacs(npuOp) ) + { + // Number of sub kernels + int subKernelWidth = s_SubkernelLimits[int(npuOp)].x; + int subKernelHeight = s_SubkernelLimits[int(npuOp)].y; + int subkernels = DivRoundUp(query.kernel->Size().x, subKernelWidth) * DivRoundUp(query.kernel->Size().y, subKernelHeight); + + int ifmFetch = + (Shape::RoundAway(ifmBlock, ifmRounding).ElementsWH() * Shape::RoundAway(query.ifmShape[0], ifmRounding).Depth()); + + int kernelRead = query.kernel->Size().AreaXY(); + if ( (npuOp != EthosU55NpuOp::Depthwise) && (npuOp != EthosU55NpuOp::Pooling) ) + { + kernelRead *= query.ifmShape[0].Depth(); + } + + int ofmBlockCount = ofmBlocks.Elements(); + + access.ifmRead[0] = ifmFetch * subkernels * ofmBlockCount; + + if ( (npuOp != EthosU55NpuOp::Pooling) && (npuOp != EthosU55NpuOp::ReduceSum) ) + { + int weightFetch = kernelRead * ofmBlockDepth * ofmBlockCount; + access.constRead[0] = weightFetch; + access.constRead[1] = query.ofmShape.Depth(); // Scales & biases + access.weightsRefetch = ofmBlocks.ElementsWH(); + } + } + else if ( npuOp == EthosU55NpuOp::Elementwise ) + { + // IFM1 is scalar + if ( query.ifmShape[0].Elements() == 1 ) + { + if ( DataTypeSizeBits(query.ifmType[0]) > 8 ) // IFM1 is a non 8-bit scalar + { + access.ifmRead[0] = Shape::RoundAway(query.ifmShape[0], ifmRounding).Elements(); + } + else if ( query.ifmShape[1].Elements() > 0 ) + { + access.ifmRead[1] = Shape::RoundAway(query.ofmShape, ifmRounding).Elements(); + } + } + else // 
IFM1 is not scalar + { + access.ifmRead[0] = Shape::RoundAway(query.ofmShape, ifmRounding).Elements(); + if ( query.ifmShape[1].Elements() > 0 ) + { + // IFM2 is not scalar + if ( query.ifmShape[1].Elements() > 1 ) + { + access.ifmRead[1] = access.ifmRead[0]; + } + else if ( DataTypeSizeBits(query.ifmType[1]) > 8 ) // IFM2 is a non 8-bit scalar + { + access.ifmRead[1] = Shape::RoundAway(query.ifmShape[1], ifmRounding).Elements(); + } + } + } + } + else + { + assert(false); + } + + access.ofmWrite = Shape::RoundAway(query.ofmShape, ofmRounding).Elements(); + + return access; +} + + +ElementAccess EthosU55Performance::ElementTransferToBytes(const PerformanceQuery &query, const ElementAccess &access) +{ + EthosU55OpConfig *opConfig = static_cast(query.config); + + ElementAccess result = access; + + // IFM bytes transferred + int ifmBits = DataTypeSizeBits(query.ifmType[0]); // All inputs expect same bit width + const int ifmCount = query.ifmShape[1].Elements() > 0 ? int(std::size(query.ifmShape)) : 1; + for ( int i = 0; i < ifmCount; i++ ) + { + result.ifmRead[i] = EstimateMemoryTransfer(_arch->_cores, true, query.ifmMemory[i], query.ifmFormat[i], ifmBits, + opConfig->IfmBlock(), query.ifmShape[i], access.ifmRead[i]); + } + + // OFM bytes transferred + result.ofmWrite = EstimateMemoryTransfer(_arch->_cores, false, query.ofmMemory, query.ofmFormat, + DataTypeSizeBits(query.ofmType), opConfig->OfmBlock(), query.ofmShape, access.ofmWrite); + + // These requires compression ratio information + result.constRead[0] = 0; + result.constRead[1] = 0; + + return result; +} + + +} // namespace regor diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_performance.hpp b/ethosu/regor/architecture/ethosu55/ethos_u55_performance.hpp new file mode 100644 index 00000000..c39e2c40 --- /dev/null +++ b/ethosu/regor/architecture/ethosu55/ethos_u55_performance.hpp @@ -0,0 +1,60 @@ +// +// SPDX-FileCopyrightText: Copyright 2021, 2023 Arm Limited and/or its affiliates +// +// 
SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "common/common.hpp" + +#include "architecture/architecture.hpp" + +namespace regor +{ + +class ArchEthosU55; + +struct EthosU55PerfInfo +{ + float outputCycles[8]; + float activationCycles[3]; +}; + +/// +/// Profiles performance analysis for Ethos-U55 +/// +class EthosU55Performance : public ArchitecturePerformance +{ +protected: + ArchEthosU55 *_arch; + const EthosU55PerfInfo *_perfInfo; + +public: + EthosU55Performance(ArchEthosU55 *arch, const EthosU55PerfInfo *perfInfo); + +public: + CycleCost MeasureCycleCost(const PerformanceQuery &query, const std::vector &fused) override; + int64_t MemToMemCycles(const ArchitectureMemory *dest, const ArchitectureMemory *source, int sizeBytes) override; + ElementAccess MeasureElementAccess(const PerformanceQuery &query) override; + ElementAccess ElementTransferToBytes(const PerformanceQuery &query, const ElementAccess &access) override; + +private: + int64_t EstimateConvCycles(const PerformanceQuery &query, const std::vector &fused); + float EstimateOutputCyclesPerElement(const PerformanceQuery &query, const std::vector &fused); + int64_t EstimateMinimumMemoryCycles(const PerformanceQuery &query); +}; + +} // namespace regor diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.cpp b/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.cpp new file mode 100644 index 
00000000..092e40f5 --- /dev/null +++ b/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.cpp @@ -0,0 +1,1469 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "ethos_u55_register_cs_generator.hpp" + +#include "common/common.hpp" +#include "common/logging.hpp" + +#include "architecture/ethos_u_scaling.hpp" +#include "common/data_type.hpp" +#include "compiler/high_level_command_stream.hpp" +#include "compiler/op_type.hpp" +#include "ethos_u55.hpp" +#include "ethos_u55_scaling.hpp" +#define NPU_DISASSEMBLE +#define NPU_NAMESPACE ethosu65 +#include "architecture/ethosu65/ethos_u65_interface.hpp" + +#include +#include +#include +#include + +namespace regor +{ +using namespace ethosu65; + +void Emitter::Emit(uint32_t instr) +{ + uint16_t cmd = instr & 0xFFFF; + assert(IsCmd0(cmd)); + bool emit = IsOp(cmd) || SetRegister(cmd, instr); + if ( emit ) + { + _stream.push_back(instr); + } +} + +void Emitter::Emit(uint64_t instr) +{ + uint16_t cmd = instr & 0xFFFF; + assert(IsCmd1(cmd)); + bool emit = IsOp(cmd) || SetRegister(cmd, instr); + if ( emit ) + { + _stream.push_back(uint32_t(instr)); + _stream.push_back(uint32_t(instr >> 32)); + } +} + +void Emitter::Clear() +{ + _stream.clear(); + _registers.clear(); +} + + +bool Emitter::SetRegister(uint16_t reg, uint64_t value) +{ + auto item = _registers.find(reg); + bool 
isChanged = item == _registers.end() || item->second != value; + if ( isChanged ) + { + _registers[reg] = value; + } + return isChanged; +} + +bool Emitter::IsCmd0(uint16_t key) +{ + return (key >> 14) == uint16_t(ethosu65::cmd_ctrl::CMD0_CTRL); +} + +bool Emitter::IsCmd1(uint16_t key) +{ + return (key >> 14) == uint16_t(ethosu65::cmd_ctrl::CMD1_CTRL); +} + +bool Emitter::IsOp(uint16_t key) +{ + return IsCmd0(key) ? (key & (1 << 8)) == 0 : (key & (1 << 8)) != 0; +} + + +/// +/// Generates register command streams for Ethos U55 and Ethos U65. +/// + +namespace +{ +const std::unordered_map<OpType, elementwise_mode> s_ElementwiseMap = { + {OpType::Add, elementwise_mode::ADD}, + {OpType::Sub, elementwise_mode::SUB}, + {OpType::Abs, elementwise_mode::ABS}, + {OpType::Mul, elementwise_mode::MUL}, + {OpType::Minimum, elementwise_mode::MIN}, + {OpType::Maximum, elementwise_mode::MAX}, + {OpType::LeakyRelu, elementwise_mode::LRELU}, + {OpType::CLZ, elementwise_mode::CLZ}, + {OpType::SHL, elementwise_mode::SHL}, + {OpType::Asr, elementwise_mode::SHR}, +}; + +activation_type ToActivationType(DataType type) +{ + if ( IsSignedInteger(type) ) + { + return activation_type::SIGNED; + } + else + { + assert(IsInteger(type)); + return activation_type::UNSIGNED; + } +} + +activation_format ToActivationFormat(TensorFormat format) +{ + if ( format == TensorFormat::NHCWB16 ) + { + return activation_format::NHCWB16; + } + else + { + assert(format == TensorFormat::NHWC); + return activation_format::NHWC; + } +} + +activation_precision ToActivationPrecision(DataType type) +{ + switch ( DataTypeSizeBits(type) ) + { + case 8: + return activation_precision::B8; + case 16: + return activation_precision::B16; + case 32: + return activation_precision::B32; + case 64: + return activation_precision::B64; + default: + assert(false); + return activation_precision::B64; + } +} + +ifm_upscale_mode ToIfmUpscaleMode(ArchResampling resampling) +{ + + if ( resampling == ArchResampling::Nearest ) + { + return
ifm_upscale_mode::NEAREST; + } + if ( resampling == ArchResampling::Zeros ) + { + return ifm_upscale_mode::ZEROS; + } + return ifm_upscale_mode::NONE; +} + +RCSRoundMode GetRoundingMode(const HLCOperation *op) +{ + switch ( op->rounding ) + { + case HLCRoundMode::NATURAL: + return RCSRoundMode::NATURAL; + case HLCRoundMode::TRUNCATE: + return RCSRoundMode::TRUNCATE; + case HLCRoundMode::DBL: + return RCSRoundMode::DBL; + case HLCRoundMode::AUTO: + return RCSRoundMode::DBL; + default: + return RCSRoundMode::DBL; + } +} + +ifm_scale_mode MapRcsIfmScaleModeToInterface(RCSIfmScaleMode rcsScaleMode) +{ + switch ( rcsScaleMode ) + { + case RCSIfmScaleMode::OPA_OPB_16: + return ifm_scale_mode::OPA_OPB_16; + case RCSIfmScaleMode::OPA_32: + return ifm_scale_mode::OPA_32; + case RCSIfmScaleMode::OPB_32: + return ifm_scale_mode::OPB_32; + default: + assert(0 && "Unexpected value, has the interface changed?"); + return ifm_scale_mode::OPA_32; + } +} + +round_mode MapRcsRoundModeToInterface(RCSRoundMode rcsRoundMode) +{ + switch ( rcsRoundMode ) + { + case RCSRoundMode::NATURAL: + return round_mode::NATURAL; + case RCSRoundMode::TRUNCATE: + return round_mode::TRUNCATE; + case RCSRoundMode::DBL: + return round_mode::DBL; + default: + assert(0 && "Unexpected value, has the interface changed?"); + return round_mode::DBL; + } +} + +} // namespace + +uint32_t EthosU55RCSGenerator::IdRegister() +{ + return id_r{}; +} + +bool EthosU55RCSGenerator::IsSupportedElementwise(const OpType opType) +{ + return s_ElementwiseMap.count(opType) != 0; +} + +EthosU55RCSGenerator::EthosU55RCSGenerator(ArchEthosU55 *arch) : _arch(arch) +{ +} + + +void EthosU55RCSGenerator::Emit(uint32_t instr) +{ + _emit.Emit(instr); +} + +void EthosU55RCSGenerator::Emit(uint64_t instr) +{ + _emit.Emit(instr); +} + +int EthosU55RCSGenerator::GetDoubleBufferOffset(HLCWeights *weights, int rangeIndex) +{ + int doubleBufferOffset = 0; + if ( weights->buffering == Buffering::Double ) + { + assert(weights->subStreams > 
0); + int depthIndex = rangeIndex / weights->subStreams; + if ( depthIndex % 2 == 1 ) + { + doubleBufferOffset = weights->maxRangeBytes; + } + } + return doubleBufferOffset; +} + +void EthosU55RCSGenerator::CheckAddressRange(ArchitectureMemory *memory, Address address, int size) +{ + assert(address >= 0); + if ( address >= memory->SizeBytes() ) + { + LOG_ERROR("Error: Address out of bounds, address {0}, memory '{1}' with size {2}\n", address, memory->Name(), + memory->SizeBytes()); + // TODO: replace assert by error handling + assert(false && "Address out of bounds"); + } + assert(size >= 0); + if ( address + size > memory->SizeBytes() ) + { + LOG_ERROR("Error: Address offset out of bounds, address {0}, offset {1}, memory '{2}' with size {3}\n", address, + size, memory->Name(), memory->SizeBytes()); + // TODO: replace assert by error handling + assert(false && "address offset out of bounds"); + } +} + +void EthosU55RCSGenerator::CheckAddresses(const HLCFeatureMap &fm) +{ + CheckAddressRange(fm.memArea.memory, fm.address, fm.AllocationSizeBytes()); + assert(fm.address % 16 == 0 || fm.format != TensorFormat::NHCWB16); +} + +// Calculates the rolling buffer address of the given coordinate. 
+Address EthosU55RCSGenerator::AddressForCoordinate(const HLCFeatureMap &fm, const Shape &strides, const Shape &coord) +{ + Shape truncatedCoord = Shape::PadAxes(coord, 4, 0) % Shape::PadAxes(fm.shape, 4, 1); + int offset = 0; + if ( fm.format == TensorFormat::NHWC ) + { + offset = strides.Dot(truncatedCoord); + } + else if ( fm.format == TensorFormat::NHCWB16 ) + { + constexpr int BRICK = 16; + int elemSize = DataTypeSizeBits(fm.dataType) / 8; + int strideX = BRICK * elemSize; + offset = + truncatedCoord.Height() * strides.Height() + truncatedCoord.Width() * strideX + + (truncatedCoord.Depth() / BRICK) * strides.Depth() + (truncatedCoord.Depth() % BRICK) * elemSize + + truncatedCoord.Batch() * strides.Batch(); + } + else + { + assert(false); + } + return fm.address + offset; +} + +// Calculates tile sizes/addresses of a feature map +TileBox EthosU55RCSGenerator::GetTiles(const HLCFeatureMap &fm, const Shape &strides, const Box &area) +{ + int crossingY = RoundAway(area.Start().Height() + 1, fm.shape.Height()); + crossingY = std::min(crossingY, area.End().Height()); + int crossingX = RoundAway(area.Start().Width() + 1, fm.shape.Width()); + crossingX = std::min(crossingX, area.End().Width()); + TileBox tiles; + auto height = crossingY - area.Start().Height(); + auto width = crossingX - area.Start().Width(); + tiles.height0 = (height + fm.stepXY.y - 1) / fm.stepXY.y; + tiles.height1 = tiles.height0; + tiles.width0 = (width + fm.stepXY.x - 1) / fm.stepXY.x; + for ( int i = 0; i < 4; ++i ) + { + tiles.address[i] = 0; + } + int fmSize = fm.AllocationSizeBytes(); + tiles.address[0] = AddressForCoordinate(fm, strides, area.Start()); + assert(fm.address <= tiles.address[0] && tiles.address[0] < fm.address + fmSize); + if ( area.End().Width() > crossingX ) + { + tiles.address[1] = AddressForCoordinate(fm, strides, area.Start().WithWidth(crossingX)); + assert(fm.address <= tiles.address[1] && tiles.address[1] < fm.address + fmSize); + assert(false && "Striping in vertical 
direction is not supported"); + } + if ( area.End().Height() > crossingY ) + { + tiles.address[2] = AddressForCoordinate(fm, strides, area.Start().WithHeight(crossingY)); + assert(fm.address <= tiles.address[2] && tiles.address[2] < fm.address + fmSize); + } + if ( area.End().Width() > crossingX && area.End().Height() > crossingY ) + { + tiles.address[3] = AddressForCoordinate(fm, strides, area.Start().WithWidth(crossingX).WithHeight(crossingY)); + assert(fm.address <= tiles.address[3] && tiles.address[3] < fm.address + fmSize); + } + if ( fm.format == TensorFormat::NHCWB16 ) + { + for ( int i = 0; i < 4; ++i ) + { + assert(tiles.address[i] % 16 == 0 && "NHCWB16 base address is not 16-byte aligned"); + } + } + return tiles; +} + +MemoryAccess EthosU55RCSGenerator::ToMemoryAccess(const HLCFeatureMap &fm, const Box &area, AccessDirection direction) +{ + const auto &strides = fm.strides; + Address start = AddressForCoordinate(fm, strides, area.Start()); + // Note: due to truncating of shape, AddressForCoordinate(fm, .., fm.shape) returns + // fm.address; the - Shape(1, 1, 1) prevents this + Address end = AddressForCoordinate(fm, strides, area.End() - Shape(1, 1, 1)) + DataTypeSizeBits(fm.dataType) / 8; + if ( end < start ) + { + // Area wraps around the end of the feature map + start = fm.address; + end = fm.address + fm.AllocationSizeBytes(); + } + return MemoryAccess(direction, fm.memArea, start, end); +} + +// Returns region number used in NPU_SET_..._REGION +uint32_t EthosU55RCSGenerator::ToRegion(const MemArea &memArea) +{ + auto region = BasePointerIndex::WeightTensor; + if ( memArea == _arch->FeatureMapMemory() ) + { + region = BasePointerIndex::ScratchTensor; + } + else if ( memArea == _arch->StagingMemory() ) + { + region = BasePointerIndex::ScratchFastTensor; + } + else if ( memArea == _arch->LUTMemory() ) + { + region = BasePointerIndex::Mem2Mem; + } + else + { + assert(memArea == _arch->ReadonlyMemory()); + } + return uint32_t(region); +} + +bool 
EthosU55RCSGenerator::UseZeroPoint0(OpType opType, const HLCFeatureMap &fm, bool isOFM) +{ + if ( fm.quantization.forceZeroPoint ) + { + return false; + } + if ( fm.quantization.zeroPoints.empty() || (fm.dataType == DataType::Int32 && !isOFM) ) + { + return true; + } + return opType == OpType::AvgPool || opType == OpType::ResizeBilinear || opType == OpType::CLZ || opType == OpType::SHL; +} + +// Checks if the feature map is a scalar, and if so, returns the +// quantized value in scalarValue. +bool EthosU55RCSGenerator::IsScalar(const HLCFeatureMap &fm, int32_t &scalarValue) +{ + const auto &view = fm.bufferView; + // A 1-sized feature map in constant memory is a scalar + bool isScalar = fm.shape.Elements() == 1 && view.HasBuffer(); + if ( isScalar ) + { + if ( fm.dataType == DataType::Int8 ) + { + scalarValue = view.Values()[0]; + } + else if ( fm.dataType == DataType::UInt8 ) + { + scalarValue = view.Values()[0]; + } + else if ( fm.dataType == DataType::Int16 ) + { + scalarValue = view.Values()[0]; + } + else + { // Unsupported scalar value + isScalar = false; + } + } + return isScalar; +} + +// Calculates waits for KERNEL_WAIT/DMA_WAIT, returns -1 if no wait is needed +// - opAccesses contains the memory accesses for the current operation +// - outstanding contains the memory accesses for ongoing "other" operations +// (DMA operations if the current op is an NPU operation, NPU operations if the current op is a DMA operation) +// Note: NPU - NPU dependency is handled via block dependency +int EthosU55RCSGenerator::CalcCommandWaits(const MemoryAccesses &opAccesses, std::deque &outstanding) +{ + int waits = 0; + for ( int index = int(outstanding.size()) - 1; index >= 0; ++waits, --index ) + { + for ( const auto &access : opAccesses ) + { + for ( const auto &outstandingAccess : outstanding[index] ) + { + if ( access.Conflicts(outstandingAccess) ) + { + // Current op needs to wait, and after it has waited, + // outstanding[0..index] are not outstanding any longer + 
for ( int i = 0; i <= index; ++i ) + { + outstanding.pop_front(); + } + return waits; + } + } + } + } + return -1; +} + +// Returns LUT slot to be used for the given LUT operation. +// Sets alreadyInLutMem to true if the LUT is already in SHRAM. +int EthosU55RCSGenerator::AllocateLutSlot( + std::vector &lutSlots, const HLCOperation *op, int sizeInSlots, int timestamp, bool &alreadyInLutMem) +{ + alreadyInLutMem = false; + int totalSlots = int(lutSlots.size()); + if ( sizeInSlots < 0 || sizeInSlots > totalSlots ) + { + assert(false); + return 0; + } + // Returns least recently used slot, unless the LUT is already in memory + int allocatedSlot = 0; + for ( int i = 0; i < totalSlots; i += sizeInSlots ) + { + if ( lutSlots[i].hlcOp == op ) + { + // LUT is already in SHRAM + allocatedSlot = i; + alreadyInLutMem = true; + break; + } + if ( lutSlots[i].lastUsed < lutSlots[allocatedSlot].lastUsed ) + { + allocatedSlot = i; + } + } + for ( int j = allocatedSlot; j < allocatedSlot + sizeInSlots; ++j ) + { + lutSlots[j].hlcOp = op; + lutSlots[j].lastUsed = timestamp; + } + return allocatedSlot; +} + +//---------------------------------------------------------------------- +// Print +//---------------------------------------------------------------------- + +int EthosU55RCSGenerator::Disassemble(const uint32_t *in, std::string &op, std::vector> &fields) +{ + return isa::disassemble(in, op, fields); +} + +//---------------------------------------------------------------------- +// Scaling (OFM/OPA/OPB_SCALE) +//---------------------------------------------------------------------- + +// Generates OFM_SCALE register for pooling operations +void EthosU55RCSGenerator::GenerateOFMScalingForPooling(HLCOperation *poolOp) +{ + QuantizedScale quant(1, 0); + bool isNoOp = _arch->UseAvgPoolNop(poolOp->type); + ethosU55Scaling::RescalePooling(poolOp, isNoOp); + + if ( poolOp->ofm.quantization && !poolOp->ofm.quantization.scales.empty() ) + { + quant = poolOp->ofm.quantization.scales[0]; + 
assert(unsigned(quant.shift) < 64); + } + + Emit(isa::npu_set_ofm_scale_t(uint32_t(quant.shift), quant.scale)); +} + +// Generates OFM/OPA/OPB_SCALE registers for elementwise operators. +// Returns the operator to scale +RCSIfmScaleMode EthosU55RCSGenerator::GenerateScalingForElementwise(HLCOperation *op, int ifm0Index) +{ + auto opToScale = RCSIfmScaleMode::OPA_OPB_16; + auto opType = op->type; + + QuantizedScale ofmScale(1, 0); + ethosU55Scaling::RescaleElementwise(op); + int ifmCnt = int(op->ifm.size()); + bool allHaveScale = + !op->ofm.quantization.scales.empty() && !op->ifm[0].quantization.scales.empty() && ifmCnt == 2 && + !op->ifm[1].quantization.scales.empty(); + + if ( opType == OpType::Mul || opType == OpType::Abs ) + { + if ( !op->ofm.quantization.scales.empty() ) + { + ofmScale = op->ofm.quantization.scales[0]; + } + } + else if ( opType == OpType::LeakyRelu ) + { + const HLCParameters *params = &op->parameters; + double alpha = params->leaky_relu.alpha; + ofmScale = QuantizedScale(alpha); + } + else if ( opType == OpType::Add || opType == OpType::Sub ) + { + uint32_t opaScale = 1; + uint32_t opbScale = 1; + uint32_t opaShift = 0; + if ( allHaveScale ) + { + ofmScale = op->ofm.quantization.scales[0]; + QuantizedScale ifm1Scale = op->ifm[ifm0Index].quantization.scales[0]; + QuantizedScale ifm2Scale = op->ifm[1 - ifm0Index].quantization.scales[0]; + opaScale = ifm1Scale.scale; + opaShift = ifm1Scale.shift; + opbScale = ifm2Scale.scale; + + if ( ifm1Scale.scale == 0 || ifm2Scale.scale == 0 ) + { + opbScale = 0; + if ( ifm1Scale.scale == 0 ) + { + opToScale = RCSIfmScaleMode::OPB_32; + opaScale = ifm2Scale.scale; + opaShift = ifm2Scale.shift; + } + else + { + opToScale = RCSIfmScaleMode::OPA_32; + } + } + if ( ifm0Index == 1 ) + { + // Reversed operands + if ( opToScale == RCSIfmScaleMode::OPA_32 ) + { + opToScale = RCSIfmScaleMode::OPB_32; + } + else if ( opToScale == RCSIfmScaleMode::OPB_32 ) + { + opToScale = RCSIfmScaleMode::OPA_32; + } + } + } + 
assert(opaShift < 64); + Emit(isa::npu_set_opa_scale_t(opaShift, opaScale)); + Emit(isa::npu_set_opb_scale_t(opbScale)); + } + assert(unsigned(ofmScale.shift) < 64); + Emit(isa::npu_set_ofm_scale_t(ofmScale.shift, ofmScale.scale)); + return opToScale; +} + +//---------------------------------------------------------------------- +// BLOCKDEP calculation +//---------------------------------------------------------------------- + +static Shape CalcIFMJobShape(const Shape &ofmBlock, Kernel *kernel, int ifmBlockDepth) +{ + // TODO MLBEDSW-8498: Consider ifm_upscale_mode for job-shape calculations + Point2i dilatedSize = kernel->DilatedWH(); + int h = RequiredInputSize(ofmBlock.Height(), kernel->Stride().y, dilatedSize.y, 1); + int w = RequiredInputSize(ofmBlock.Width(), kernel->Stride().x, dilatedSize.x, 1); + return Shape(1, h, w, ifmBlockDepth); +} + +// Given the area and block size, adds the first/last jobs (depending on fromStart) to jobs. +// - area: total amount of work to perform +// - jobShape: size of each job +// - fromStart: if true, the first jobs are added, if false, the last jobs are added +// (in that case, the very last job is added last) +void EthosU55RCSGenerator::GetJobs(const Box &area, const Shape &jobShape, int nrJobsToGet, bool fromStart, std::vector &jobs) +{ + Shape jobSplit = Shape::DivRoundUp(area.End() - area.Start(), jobShape); + int z = jobSplit.Depth(); + int w = jobSplit.Width(); + int h = jobSplit.Height(); + int n = z * w * h; // n = total number of jobs for the whole area + const auto &start = area.Start().Extract(-3, -2, -1); + const auto &end = area.End().Extract(-3, -2, -1); + int firstJob = fromStart ? 0 : std::max(0, n - nrJobsToGet); + int lastJob = fromStart ? 
std::min(n, nrJobsToGet) : n; + for ( int i = firstJob; i < lastJob; ++i ) + { + Shape from = Shape(start.Height() + (i / (z * w)) * jobShape.Height(), + start.Width() + ((i / z) % w) * jobShape.Width(), start.Depth() + (i % z) * jobShape.Depth()); + jobs.emplace_back(from, Shape::Min(from + jobShape, end)); + } +} + +// Calculates the value for the BLOCKDEP register +int EthosU55RCSGenerator::CalcBlockDep(HLCStripe *prevStripe, HLCStripe *stripe) +{ + if ( prevStripe == nullptr ) + { + return 0; + } + const auto &op = stripe->operation; + const auto &prevOp = prevStripe->operation; + const auto &prevOfm = prevOp->ofm; + if ( _arch->_shram.reservedEndBanks == 0 ) + { + // SHRAM has no reserved LUT banks + if ( _stripeToLutSlot.count(prevStripe) && !_stripeToLutSlot.count(stripe) ) + { + // Previous operation uses LUT, current does not + return 0; // Prevents corruption of the LUT + } + } + + int ifmIndex = (op->ifm.size() > 1 && op->ifm[1].address == prevOfm.address && op->ifm[1].memArea == prevOfm.memArea) ? 
1 : 0; + const auto &ifm = op->ifm[ifmIndex]; + int maxJobs = _arch->MaxBlockdep(); + if ( ifm.address != prevOfm.address || ifm.memArea != prevOfm.memArea ) + { + for ( const auto &fm : op->ifm ) + { + if ( fm.memArea == prevOfm.memArea && + Overlaps(fm.address, fm.address + fm.AllocationSizeBytes(), prevOfm.address, prevOfm.address + prevOfm.AllocationSizeBytes()) ) + { + // Previous OFM overlaps in unexpected way with current IFM + assert(false && "Unexpected overlap previous OFM/current IFM"); + return 0; + } + } + // Previous operation does not produce current operation's IFM + return maxJobs; + } + if ( op->ifm.size() > 1 && ifm.AllocationSizeBytes() < op->ifm[1 - ifmIndex].AllocationSizeBytes() ) + { + // Prev OFM produces IFM2 which is broadcasted (this should be rare) + return 0; + } + if ( prevOfm.shape != ifm.shape ) + { + // OFM has been reshaped; the job overlap calculations below do not work in this case + return 0; + } + // Previous operation produces current operations IFM + auto prevConfig = static_cast(prevOp->config); + Shape prevBlock = prevConfig->OfmBlock(); + auto config = static_cast(op->config); + Shape currBlock = CalcIFMJobShape(config->OfmBlock(), &op->kernel, config->IfmBlock().Depth()); + // Get the last few jobs from the previous operation (each job produces a part of the current op's IFM) + std::vector lastPrevJobs; + GetJobs(prevStripe->ofmArea, prevBlock, maxJobs, false, lastPrevJobs); + // Get the first few jobs from the current operation (each job consumes a part of the current op's IFM) + std::vector firstCurrJobs; + GetJobs(stripe->ifmAreas[ifmIndex], currBlock, maxJobs, true, firstCurrJobs); + // Find the highest block dependency such that there is no overlap between + // any job from the previous op with any job from the current op during block dependency jobs + int sz = int(std::min(lastPrevJobs.size(), firstCurrJobs.size())); + int prevLastIx = int(lastPrevJobs.size()) - 1; + for ( int blockdep = 0; blockdep < sz; 
++blockdep ) + { + bool overlaps = false; + for ( int i = 0; !overlaps && i <= blockdep; ++i ) + { + for ( int j = blockdep - i; !overlaps && i + j <= blockdep; ++j ) + { + if ( firstCurrJobs[i].Overlaps(lastPrevJobs[prevLastIx - j]) ) + { + overlaps = true; + } + } + } + if ( overlaps ) + { + return blockdep; + } + } + // No overlap found + return sz; +} + +//---------------------------------------------------------------------- +// Register generation +//---------------------------------------------------------------------- + +void EthosU55RCSGenerator::GeneratePadding(const HLCPadding &padding) +{ + Emit(isa::npu_set_ifm_pad_top_t(padding.top)); + Emit(isa::npu_set_ifm_pad_left_t(padding.left)); + Emit(isa::npu_set_ifm_pad_bottom_t(padding.bottom)); + Emit(isa::npu_set_ifm_pad_right_t(padding.right)); +} + +// Generates ACTIVATION registers +void EthosU55RCSGenerator::GenerateActivation(const HLCStripe *stripe, MemoryAccesses &memoryAccesses) +{ + const HLCOperation *op = stripe->operation.get(); + assert(op->subOps.size() <= 1); + OpType opType = OpType::None; + if ( IsActivation(op->type) ) + { + // Non-fused activation + opType = op->type; + assert(op->subOps.empty() || opType == op->subOps[0].type); + } + else if ( !op->subOps.empty() ) + { + // Fused activation + opType = op->subOps[0].type; + } + auto &ofm = op->ofm; + int size = std::min(16, DataTypeSizeBits(ofm.dataType)); + assert(size > 0 && "Illegal data type"); + bool isSigned = bool(ofm.dataType & DataType::Signed); + int64_t quantizedMin = isSigned ? -(1LL << (size - 1)) : 0; + int64_t quantizedMax = isSigned ? 
(1LL << (size - 1)) - 1 : (1LL << size) - 1; + + auto act = activation_function::RELU; + auto clipRange = activation_clip_range::OFM_PRECISION; + if ( ofm.quantization.quantMin.size() ) + { + quantizedMin = std::max(quantizedMin, ofm.quantization.quantMin[0]); + } + if ( ofm.quantization.quantMax.size() ) + { + quantizedMax = std::min(quantizedMax, ofm.quantization.quantMax[0]); + } + + if ( opType == OpType::Sigmoid ) + { + act = activation_function::SIGMOID; + } + else if ( opType == OpType::Tanh ) + { + act = activation_function::TANH; + } + else if ( opType == OpType::LUT ) + { + auto ¶m = op->subOps[0].parameters.lut; + size = param.sizeBytes; + assert(size == 256 || size == 1024 || size == 2048); + + int tableIndex = 0; + auto pos = _stripeToLutSlot.find(stripe); + if ( pos != _stripeToLutSlot.end() ) + { + tableIndex = pos->second; + } + else + { + assert(false && "Command uses lut, but no lut info found"); + } + act = activation_function(int(activation_function::TABLE_0) + tableIndex); + if ( ofm.dataType == DataType::Int32 ) + { + // force INT8 range + clipRange = activation_clip_range::FORCE_INT8; + quantizedMin = std::max(quantizedMin, -128); + quantizedMax = std::min(quantizedMax, 127); + } + auto &layout = static_cast(op->config)->_layout; + Address lutStart = Address(layout.lutStart) * _arch->_shram.bankSizeBytes + tableIndex * _arch->_shram.lutSlotSize; + memoryAccesses.emplace_back(AccessDirection::Read, _arch->LUTMemory(), lutStart, lutStart + param.sizeBytes); + } + assert(quantizedMin <= std::numeric_limits::max()); + assert(quantizedMax <= std::numeric_limits::max()); + Emit(isa::npu_set_activation_t(act, clipRange)); + Emit(isa::npu_set_activation_min_t(uint32_t(quantizedMin))); + Emit(isa::npu_set_activation_max_t(uint32_t(quantizedMax))); +} + +// Generates KERNEL related registers +void EthosU55RCSGenerator::GenerateKernel(const Kernel &kernel, bool partKernel) +{ + auto dilatedWH = kernel.DilatedWH(); + 
Emit(isa::npu_set_kernel_height_m1_t(dilatedWH.y - 1)); + Emit(isa::npu_set_kernel_width_m1_t(dilatedWH.x - 1)); + uint32_t stride_x_lsb = (kernel.Stride().x - 1) & 1; + uint32_t stride_y_lsb = (kernel.Stride().y - 1) & 1; + uint32_t stride_x_msb = ((kernel.Stride().x - 1) >> 1) & 1; + uint32_t stride_y_msb = ((kernel.Stride().y - 1) >> 1) & 1; + auto weightOrder = partKernel ? weight_order::PART_KERNEL_FIRST : weight_order::DEPTH_FIRST; + kernel_dilation dilation_x = kernel_dilation(kernel.Dilation().x - 1); + kernel_dilation dilation_y = kernel_dilation(kernel.Dilation().y - 1); + kernel_decomposition decomposition = kernel_decomposition::D8X8; // Kernel decomposition + Emit(isa::npu_set_kernel_stride_t( + stride_x_lsb, stride_y_lsb, weightOrder, dilation_x, dilation_y, decomposition, stride_x_msb, stride_y_msb)); +} + +// Generates IFM2_BROADCAST register for binary elementwise operations +void EthosU55RCSGenerator::GenerateIFM2Broadcast(const Shape &ifmShape, const Shape &ifm2Shape, bool reversedOperands, bool isScalar) +{ + auto broadcastH = broadcast_mode::DISABLE; + auto broadcastW = broadcast_mode::DISABLE; + auto broadcastC = broadcast_mode::DISABLE; + auto order = reversedOperands ? 
ifm2_operand_order::ORDER_A : ifm2_operand_order::ORDER_B; + auto isConstant = broadcast_mode::DISABLE; + if ( isScalar ) + { + isConstant = broadcast_mode::ENABLE; + } + else + { + if ( ifmShape.Height() != ifm2Shape.Height() ) + { + // Broadcast in 'H' dimension + broadcastH = broadcast_mode::ENABLE; + assert(ifm2Shape.Height() == 1); + } + if ( ifmShape.Width() != ifm2Shape.Width() ) + { + // Broadcast in 'W' dimension + broadcastW = broadcast_mode::ENABLE; + assert(ifm2Shape.Width() == 1); + } + if ( ifmShape.Depth() != ifm2Shape.Depth() ) + { + // Broadcast in 'C' dimension + broadcastC = broadcast_mode::ENABLE; + assert(ifm2Shape.Depth() == 1); + } + } + Emit(isa::npu_set_ifm2_broadcast_t(broadcastH, broadcastW, broadcastC, order, isConstant)); +} + +// Generates IFM_PRECISION register +void EthosU55RCSGenerator::GenerateIFMPrecision(const HLCFeatureMap &fm, RCSIfmScaleMode scaleMode) +{ + activation_type type = ToActivationType(fm.dataType); + activation_precision precision = ToActivationPrecision(fm.dataType); + activation_format format = ToActivationFormat(fm.format); + round_mode roundMode = round_mode::DBL; + ifm_scale_mode interfaceScaleMode = MapRcsIfmScaleModeToInterface(scaleMode); + Emit(isa::npu_set_ifm_precision_t(type, precision, format, interfaceScaleMode, roundMode)); +} + +// Generates IFM2_PRECISION register +void EthosU55RCSGenerator::GenerateIFM2Precision(const HLCFeatureMap &fm) +{ + activation_type type = ToActivationType(fm.dataType); + activation_precision precision = ToActivationPrecision(fm.dataType); + activation_format format = ToActivationFormat(fm.format); + Emit(isa::npu_set_ifm2_precision_t(type, precision, format)); +} + +// Generates OFM_PRECISION register +void EthosU55RCSGenerator::GenerateOFMPrecision(const HLCFeatureMap &fm, bool useGlobalScale, RCSRoundMode roundMode) +{ + activation_type type = ToActivationType(fm.dataType); + activation_precision precision = ToActivationPrecision(fm.dataType); + activation_format format 
= ToActivationFormat(fm.format); + round_mode interfaceRoundMode = MapRcsRoundModeToInterface(roundMode); + auto scaleMode = useGlobalScale ? ofm_scale_mode::GLOBAL : ofm_scale_mode::PER_CHANNEL; + Emit(isa::npu_set_ofm_precision_t(type, precision, format, scaleMode, interfaceRoundMode)); +} + +// Generates common IFM registers +void EthosU55RCSGenerator::GenerateIFM(OpType opType, const HLCFeatureMap &fm, const Box &inputArea) +{ + CheckAddresses(fm); + Emit(isa::npu_set_ifm_region_t(ToRegion(fm.memArea))); + Shape strides = fm.strides; + auto tiles = GetTiles(fm, strides, inputArea); + auto boxSize = inputArea.SizeShape(); + // IFM_BASE registers + Emit(isa::npu_set_ifm_base0_t(tiles.address[0])); + Emit(isa::npu_set_ifm_base1_t(tiles.address[1])); + Emit(isa::npu_set_ifm_base2_t(tiles.address[2])); + Emit(isa::npu_set_ifm_base3_t(tiles.address[3])); + // Tile related registers + Emit(isa::npu_set_ifm_height0_m1_t(tiles.height0 - 1)); + Emit(isa::npu_set_ifm_height1_m1_t(tiles.height1 - 1)); + Emit(isa::npu_set_ifm_width0_m1_t(tiles.width0 - 1)); + Emit(isa::npu_set_ifm_depth_m1_t(boxSize.Depth() - 1)); + // IFM_STRIDE registers + Emit(isa::npu_set_ifm_stride_y_t(strides.Height() * fm.stepXY.y)); + Emit(isa::npu_set_ifm_stride_x_t(strides.Width() * fm.stepXY.x)); + Emit(isa::npu_set_ifm_stride_c_t(strides.Depth())); + // IFM_ZERO_POINT register + auto &quant = fm.quantization; + uint32_t zp = UseZeroPoint0(opType, fm, false) ? 
0 : uint32_t(quant.zeroPoints[0]); + Emit(isa::npu_set_ifm_zero_point_t(zp)); +} + +// Generates common IFM2 registers +void EthosU55RCSGenerator::GenerateIFM2(OpType opType, const HLCFeatureMap &fm, const Box &inputArea, bool isScalar, int32_t scalarValue) +{ + if ( isScalar ) + { + Emit(isa::npu_set_ifm2_scalar_t(uint32_t(scalarValue))); + } + else + { + CheckAddresses(fm); + Emit(isa::npu_set_ifm2_region_t(ToRegion(fm.memArea))); + Shape strides = fm.strides; + auto tiles = GetTiles(fm, strides, inputArea); + // IFM2_BASE registers + Emit(isa::npu_set_ifm2_base0_t(tiles.address[0])); + Emit(isa::npu_set_ifm2_base1_t(tiles.address[1])); + Emit(isa::npu_set_ifm2_base2_t(tiles.address[2])); + Emit(isa::npu_set_ifm2_base3_t(tiles.address[3])); + // Tile related registers + Emit(isa::npu_set_ifm2_height0_m1_t(tiles.height0 - 1)); + Emit(isa::npu_set_ifm2_height1_m1_t(tiles.height1 - 1)); + Emit(isa::npu_set_ifm2_width0_m1_t(tiles.width0 - 1)); + // IFM2_STRIDE registers + Emit(isa::npu_set_ifm2_stride_y_t(strides.Height() * fm.stepXY.y)); + Emit(isa::npu_set_ifm2_stride_x_t(strides.Width() * fm.stepXY.x)); + Emit(isa::npu_set_ifm2_stride_c_t(strides.Depth())); + } + // IFM2_ZERO_POINT register + auto &quant = fm.quantization; + uint32_t zp = UseZeroPoint0(opType, fm, false) ? 
0 : uint32_t(quant.zeroPoints[0]); + Emit(isa::npu_set_ifm2_zero_point_t(zp)); +} + +// Generates OFM registers +void EthosU55RCSGenerator::GenerateOFM(OpType opType, const HLCFeatureMap &fm, const Box &outputArea) +{ + CheckAddresses(fm); + Emit(isa::npu_set_ofm_region_t(ToRegion(fm.memArea))); + Shape strides = fm.strides; + auto tiles = GetTiles(fm, strides, outputArea); + auto boxSize = outputArea.SizeShape(); + // OFM_BASE registers + Emit(isa::npu_set_ofm_base0_t(tiles.address[0])); + Emit(isa::npu_set_ofm_base1_t(tiles.address[1])); + Emit(isa::npu_set_ofm_base2_t(tiles.address[2])); + Emit(isa::npu_set_ofm_base3_t(tiles.address[3])); + // OFM size + Emit(isa::npu_set_ofm_height_m1_t(DivRoundUp(boxSize.Height(), fm.stepXY.y) - 1)); + Emit(isa::npu_set_ofm_width_m1_t(DivRoundUp(boxSize.Width(), fm.stepXY.x) - 1)); + // Tile related registers + Emit(isa::npu_set_ofm_height0_m1_t(tiles.height0 - 1)); + Emit(isa::npu_set_ofm_height1_m1_t(tiles.height1 - 1)); + Emit(isa::npu_set_ofm_width0_m1_t(tiles.width0 - 1)); + Emit(isa::npu_set_ofm_depth_m1_t(boxSize.Depth() - 1)); + // OFM_STRIDE registers + Emit(isa::npu_set_ofm_stride_y_t(strides.Height() * fm.stepXY.y)); + Emit(isa::npu_set_ofm_stride_x_t(strides.Width() * fm.stepXY.x)); + Emit(isa::npu_set_ofm_stride_c_t(strides.Depth())); + // OFM_ZERO_POINT register + auto &quant = fm.quantization; + uint32_t zp = UseZeroPoint0(opType, fm, true) ? 
0 : uint32_t(quant.zeroPoints[0]); + Emit(isa::npu_set_ofm_zero_point_t(zp)); +} + +// Generates WEIGHT registers +void EthosU55RCSGenerator::GenerateWeights(const HLCStripe *stripe, MemoryAccesses &memoryAccesses) +{ + auto weights = stripe->operation->weights.get(); + if ( weights == nullptr ) + { + return; + } + int depth = stripe->weightRangeDepth; + Emit(isa::npu_set_weight_region_t(ToRegion(weights->memArea))); + auto item0 = weights->encodedRanges.find(WeightKey(0, depth)); + assert(item0 != weights->encodedRanges.end()); + auto &range0 = item0->second; + int doubleBufferOffset = GetDoubleBufferOffset(weights, range0.index); + Address address = weights->address + range0.weightOffset + doubleBufferOffset; + int length = RoundAway(range0.weightBytes, 16); + CheckAddressRange(weights->memArea.memory, address, length); + Emit(isa::npu_set_weight_base_t(address)); + Emit(isa::npu_set_weight_length_t(length)); + memoryAccesses.emplace_back(AccessDirection::Read, weights->memArea, address, address + length); + auto item1 = weights->encodedRanges.find(WeightKey(1, depth)); + if ( item1 != weights->encodedRanges.end() ) + { + auto &range1 = item1->second; + Address address1 = weights->address + RoundAway(range0.TotalBytes(), 16) + range1.weightOffset + doubleBufferOffset; + int length1 = RoundAway(range1.weightBytes, 16); + CheckAddressRange(weights->memArea.memory, address1, length1); + Emit(isa::npu_set_weight1_base_t(address1)); + Emit(isa::npu_set_weight1_length_t(length1)); + memoryAccesses.emplace_back(AccessDirection::Read, weights->memArea, address1, address1 + length1); + } + else if ( _arch->_cores > 1 ) + { + Emit(isa::npu_set_weight1_length_t(0)); + } +} + +// Generates SCALE registers +void EthosU55RCSGenerator::GenerateScales(const HLCStripe *stripe, MemoryAccesses &memoryAccesses) +{ + auto scales = stripe->operation->scales.get(); + if ( scales == nullptr ) + { + assert(!stripe->operation->weights); + return; + } + int depth = 
stripe->weightRangeDepth; + Emit(isa::npu_set_scale_region_t(ToRegion(scales->memArea))); + auto item0 = scales->encodedRanges.find(WeightKey(0, depth)); + assert(item0 != scales->encodedRanges.end()); + auto &range0 = item0->second; + int doubleBufferOffset = GetDoubleBufferOffset(scales, range0.index); + Address address = scales->address + doubleBufferOffset; + int length = RoundAway(range0.scaleBytes, 16); + CheckAddressRange(scales->memArea.memory, address, length); + Emit(isa::npu_set_scale_base_t(address)); + Emit(isa::npu_set_scale_length_t(length)); + memoryAccesses.emplace_back(AccessDirection::Read, scales->memArea, address, address + length); + auto item1 = scales->encodedRanges.find(WeightKey(1, depth)); + if ( item1 != scales->encodedRanges.end() ) + { + auto &range1 = item1->second; + Address address1 = address + RoundAway(range0.TotalBytes(), 16); + int length1 = RoundAway(range1.scaleBytes, 16); + CheckAddressRange(scales->memArea.memory, address1, length1); + Emit(isa::npu_set_scale1_base_t(address1)); + Emit(isa::npu_set_scale1_length_t(length1)); + memoryAccesses.emplace_back(AccessDirection::Read, scales->memArea, address1, address1 + length1); + } + else if ( _arch->_cores > 1 ) + { + Emit(isa::npu_set_scale1_length_t(0)); + } +} + +// Generates OFM_BLK_HEIGHT/WIDTH/DEPTH registers +void EthosU55RCSGenerator::GenerateBlockConfig(const EthosU55OpConfig *config) +{ + Emit(isa::npu_set_ofm_blk_height_m1_t(config->OfmBlock().Height() - 1)); + Emit(isa::npu_set_ofm_blk_width_m1_t(config->OfmBlock().Width() - 1)); + Emit(isa::npu_set_ofm_blk_depth_m1_t(config->OfmBlock().Depth() - 1)); +} + +// Generates IB_END/IB_START/AB_START/ACC_FORMAT registers +void EthosU55RCSGenerator::GenerateShramRegisters(const EthosU55OpConfig *config, bool hasIfm2) +{ + auto &layout = config->_layout; + Emit(isa::npu_set_ifm_ib_end_t(layout.ibEnd)); + Emit(isa::npu_set_ab_start_t(layout.abStart)); + if ( hasIfm2 ) + { + 
Emit(isa::npu_set_ifm2_ib_start_t(layout.ibStart2)); + } + // ACC_FORMAT register + auto accType = config->_accumulatorType; + acc_format format; + if ( accType == EthosU55SHRamElements::SHRAM_Acc16 ) + { + format = acc_format::F16; + } + else if ( accType == EthosU55SHRamElements::SHRAM_Acc32 ) + { + format = acc_format::I32; + } + else + { + assert(accType == EthosU55SHRamElements::SHRAM_Acc40); + format = acc_format::I40; + } + Emit(isa::npu_set_acc_format_t(format)); +} + +// Calculates and generates KERNEL_WAIT or DMA_WAIT register +void EthosU55RCSGenerator::GenerateWaits(bool isKernelWait, const MemoryAccesses &memoryAccesses, int maxWaits, + std::deque &outstandingAccesses, std::deque &accessesToUpdate) +{ + int waits = CalcCommandWaits(memoryAccesses, outstandingAccesses); + if ( waits >= 0 ) + { + if ( isKernelWait ) + { + Emit(isa::npu_op_kernel_wait_t(waits)); + } + else + { + Emit(isa::npu_op_dma_wait_t(waits)); + } + } + accessesToUpdate.push_back(memoryAccesses); + if ( int(accessesToUpdate.size()) > maxWaits ) + { + accessesToUpdate.pop_front(); + } +} + +// Inserts DMA commands for copying LUTs from constant memory +// to LUT memory +std::vector> +EthosU55RCSGenerator::InsertLUTDMACommands(std::vector> &cmds) +{ + std::vector> result; + int lutSlotSize = _arch->_shram.lutSlotSize; + int slots = (_arch->_shram.bankSizeBytes * _arch->_shram.lutBanks) / lutSlotSize; + std::vector lutSlots(slots); + int timestamp = 0; + result.reserve(cmds.size()); + for ( auto &hlc : cmds ) + { + ++timestamp; + if ( hlc->IsStripe() ) + { + auto stripe = static_cast(hlc.get()); + auto op = stripe->operation; + auto config = static_cast(op->config); + if ( !op->subOps.empty() && op->subOps[0].type == OpType::LUT ) + { + const auto &srcTens = op->subOps[0].parameters.lut; + assert(config->_layout.lutStart > 0); + assert(srcTens.sizeBytes % lutSlotSize == 0); + bool alreadyInLutMem; + int sizeInSlots = srcTens.sizeBytes / lutSlotSize; + int slot = 
AllocateLutSlot(lutSlots, op.get(), sizeInSlots, timestamp, alreadyInLutMem); + _stripeToLutSlot[stripe] = slot; + + if ( !alreadyInLutMem ) + { + auto dma = std::make_unique(); + dma->srcMemArea = srcTens.memArea; + dma->srcAddress = srcTens.address; + dma->length = srcTens.sizeBytes; + dma->destMemArea = _arch->LUTMemory(); + dma->destAddress = _arch->_shram.bankSizeBytes * config->_layout.lutStart + slot * lutSlotSize; + result.push_back(std::move(dma)); + } + } + else if ( _arch->_shram.reservedEndBanks == 0 ) + { + // LUT is overwritten by SHRAM accumulator buffers; clear slots + for ( auto &slot : lutSlots ) + { + slot.hlcOp = nullptr; + slot.lastUsed = 0; + } + } + } + result.push_back(std::move(hlc)); + } + return result; +} + +//---------------------------------------------------------------------- +// Operations +//---------------------------------------------------------------------- + +// Generates NPU_OP_* command +void EthosU55RCSGenerator::GenerateOperationCode(OpType opType) +{ + if ( IsPooling(opType) ) + { + pooling_mode mode; + if ( opType == OpType::AvgPool || opType == OpType::ResizeBilinear ) + { + mode = pooling_mode::AVERAGE; + } + else if ( opType == OpType::MaxPool ) + { + mode = pooling_mode::MAX; + } + else + { + assert(opType == OpType::ReduceSum); + mode = pooling_mode::REDUCE_SUM; + } + Emit(isa::npu_op_pool_t(mode)); + } + else if ( IsDepthwise(opType) ) + { + Emit(isa::npu_op_depthwise_t()); + } + else if ( IsConvolution(opType) || IsVectorProduct(opType) ) + { + Emit(isa::npu_op_conv_t()); + } + else if ( IsElementwise(opType) ) + { + const auto &item = s_ElementwiseMap.find(opType); + if ( item == s_ElementwiseMap.end() ) + { + assert(false && "Unsupported elementwise operator"); + } + else + { + Emit(isa::npu_op_elementwise_t(item->second)); + } + } + else if ( _arch->UseAvgPoolNop(opType) ) + { + // Implemented using AvgPool + Emit(isa::npu_op_pool_t(pooling_mode::AVERAGE)); + } + else + { + assert(false && "Unsupported 
operator"); + } +} + +void EthosU55RCSGenerator::GenerateCommon(const HLCStripe *stripe, bool useGlobalScale, RCSIfmScaleMode opToScale, + MemoryAccesses &memoryAccesses, int ifm0Index) +{ + auto op = stripe->operation.get(); + GenerateIFM(op->type, op->ifm[ifm0Index], stripe->ifmAreas[ifm0Index]); + memoryAccesses.push_back(ToMemoryAccess(op->ifm[ifm0Index], stripe->ifmAreas[ifm0Index], AccessDirection::Read)); + GenerateIFMPrecision(op->ifm[ifm0Index], opToScale); + ifm_upscale_mode upscaleMode = ToIfmUpscaleMode(op->ifm[0].resamplingMode); + Emit(isa::npu_set_ifm_upscale_t(upscaleMode)); + if ( !IsElementwise(op->type) ) + { + GeneratePadding(stripe->padding); + } + GenerateOFM(op->type, op->ofm, stripe->ofmArea); + memoryAccesses.push_back(ToMemoryAccess(op->ofm, stripe->ofmArea, AccessDirection::Write)); + RCSRoundMode roundMode = GetRoundingMode(op); + GenerateOFMPrecision(op->ofm, useGlobalScale, roundMode); + EthosU55OpConfig *config = static_cast(stripe->operation->config); + if ( !IsElementwise(op->type) ) + { + GenerateKernel(op->kernel, config->Traversal() == EthosUTraversal::PartKernel); + } + GenerateWeights(stripe, memoryAccesses); + GenerateScales(stripe, memoryAccesses); + GenerateActivation(stripe, memoryAccesses); + if ( _arch->_shram.reservedEndBanks == 0 ) + { + // SHRAM has no reserved LUT banks; LUT is overwritten by accumulator buffer + memoryAccesses.emplace_back( + AccessDirection::Write, _arch->LUTMemory(), 0, _arch->_shram.bankSizeBytes * _arch->_shram.totalBanks); + } +} + +// Conv2D/Depthwise operations +void EthosU55RCSGenerator::GenerateConvolutionOp(const HLCStripe *stripe, MemoryAccesses &memoryAccesses) +{ + GenerateCommon(stripe, false, RCSIfmScaleMode::OPA_OPB_16, memoryAccesses); +} + +// MaxPool/AvgPool/ResizeBilinear or operations that are mapped to AvgPool +void EthosU55RCSGenerator::GeneratePoolingOp(HLCStripe *stripe, MemoryAccesses &memoryAccesses) +{ + auto op = stripe->operation.get(); + auto pad = stripe->padding; + 
auto padSum = pad.top + pad.left + pad.bottom + pad.right; + bool useGlobalScale = op->type != OpType::MaxPool && padSum == 0; + + if ( _arch->UseAvgPoolNop(op->type) ) + { + assert(op->kernel.Size() == Point2i(1, 1)); + assert(op->kernel.Stride() == Point2i(1, 1)); + assert(op->kernel.Dilation() == Point2i(1, 1)); + assert(op->kernel.DepthMultiplier() == 1); + assert(useGlobalScale); + } + GenerateCommon(stripe, useGlobalScale, RCSIfmScaleMode::OPA_OPB_16, memoryAccesses); + if ( useGlobalScale ) + { + GenerateOFMScalingForPooling(op); + } +} + +// Elementwise operations +void EthosU55RCSGenerator::GenerateElementwiseOp(HLCStripe *stripe, MemoryAccesses &memoryAccesses) +{ + auto op = stripe->operation.get(); + auto opType = op->type; + bool useGlobalScale = opType == OpType::Add || opType == OpType::Sub || opType == OpType::Mul || opType == OpType::LeakyRelu || opType == OpType::Abs; + if ( IsUnaryElementwise(opType) ) + { + assert(op->ifm.size() == 1); + auto opToScale = GenerateScalingForElementwise(op, 0); + GenerateCommon(stripe, useGlobalScale, opToScale, memoryAccesses); + } + else + { + // Binary operation: generate IFM2 registers + assert(op->ifm.size() == 2); + assert(stripe->ifmAreas.size() == 2); + int32_t scalarValue = 0; + auto ifmShape = stripe->ifmAreas[0].SizeShape(); + auto ifm2Shape = stripe->ifmAreas[1].SizeShape(); + bool reversedOperands = IsScalar(op->ifm[0], scalarValue) || (ifmShape != ifm2Shape && ifmShape.IsSubShapeOf(ifm2Shape)); + int ifmIndex = 0; + if ( reversedOperands ) + { + // If reversed, the scalar/broadcasted feature map has to be the ifm2 tensor, + // so switch ifm/ifm2 + ifmIndex = 1; + std::swap(ifmShape, ifm2Shape); + } + auto opToScale = GenerateScalingForElementwise(op, ifmIndex); + GenerateCommon(stripe, useGlobalScale, opToScale, memoryAccesses, ifmIndex); + int ifm2Index = 1 - ifmIndex; + bool isScalar = IsScalar(op->ifm[ifm2Index], scalarValue); + GenerateIFM2(opType, op->ifm[ifm2Index], stripe->ifmAreas[ifm2Index], 
isScalar, scalarValue); + if ( !isScalar ) + { + memoryAccesses.push_back(ToMemoryAccess(op->ifm[ifm2Index], stripe->ifmAreas[ifm2Index], AccessDirection::Read)); + } + GenerateIFM2Precision(op->ifm[ifm2Index]); + GenerateIFM2Broadcast(ifmShape, ifm2Shape, reversedOperands, isScalar); + } +} + +bool EthosU55RCSGenerator::GenerateStripe(HLCStripe *stripe, MemoryAccesses &memoryAccesses) +{ + auto opType = stripe->operation->type; + EthosU55NpuOp npuOp = ArchEthosU55::GetHWOp(opType); + if ( npuOp == EthosU55NpuOp::Pooling || npuOp == EthosU55NpuOp::ReduceSum ) + { + GeneratePoolingOp(stripe, memoryAccesses); + } + else if ( npuOp == EthosU55NpuOp::Depthwise || npuOp == EthosU55NpuOp::Convolution || npuOp == EthosU55NpuOp::VectorProduct ) + { + GenerateConvolutionOp(stripe, memoryAccesses); + } + else if ( npuOp == EthosU55NpuOp::Elementwise ) + { + GenerateElementwiseOp(stripe, memoryAccesses); + } + else + { + LOG_ERROR("Register command stream generator: unsupported operator '{}'\n", OpTypeToString(opType)); + assert(false); + return false; + } + EthosU55OpConfig *config = static_cast(stripe->operation->config); + GenerateBlockConfig(config); + GenerateShramRegisters(config, stripe->operation->ifm.size() >= 2); + return true; +} + +// Generates register commands for DMA operations +void EthosU55RCSGenerator::GenerateDMA(const HLCDMA *dma, MemoryAccesses &memoryAccesses) +{ + auto srcRegionMode = dma_region_mode::EXTERNAL; + auto destRegionMode = dma_region_mode::EXTERNAL; + if ( dma->destMemArea == _arch->LUTMemory() ) + { + destRegionMode = dma_region_mode::INTERNAL; + } + auto strideMode = dma_stride_mode::D1; + CheckAddressRange(dma->srcMemArea.memory, dma->srcAddress, dma->length); + CheckAddressRange(dma->destMemArea.memory, dma->destAddress, dma->length); + Emit(isa::npu_set_dma0_src_region_t(ToRegion(dma->srcMemArea), srcRegionMode, strideMode)); + Emit(isa::npu_set_dma0_src_t(dma->srcAddress)); + 
Emit(isa::npu_set_dma0_dst_region_t(ToRegion(dma->destMemArea), destRegionMode, strideMode)); + Emit(isa::npu_set_dma0_dst_t(dma->destAddress)); + Emit(isa::npu_set_dma0_len_t(dma->length)); + memoryAccesses.emplace_back(AccessDirection::Read, dma->srcMemArea, dma->srcAddress, dma->srcAddress + dma->length); + memoryAccesses.emplace_back(AccessDirection::Write, dma->destMemArea, dma->destAddress, dma->destAddress + dma->length); +} + +std::vector EthosU55RCSGenerator::GenerateCommandStream(std::vector> &highLevelCommandStream, + std::vector> *cmdRanges, bool verbose) +{ + _emit.Clear(); + _stripeToLutSlot.clear(); + GenerateInitialRegisterSetup(); + auto cmds = InsertLUTDMACommands(highLevelCommandStream); + std::deque outstandingDmaAccesses; + std::deque outstandingNpuAccesses; + int maxOutstandingDMAOps = _arch->MaxOutstandingDMAOps(); + int maxOutstandingKernelOps = _arch->MaxOutstandingKernelOps(); + HLCStripe *prevOp = nullptr; + std::vector> debugInfo; + for ( auto &hlc : cmds ) + { + MemoryAccesses memoryAccesses; + int emitStart = _emit.Position(); + if ( hlc->IsStripe() ) + { + auto stripe = static_cast(hlc.get()); + if ( verbose ) + { + debugInfo.emplace_back(emitStart, stripe->operation->ToString()); + } + if ( !GenerateStripe(stripe, memoryAccesses) ) + { + return std::vector(); + } + // BLOCKDEP register + int blockdep = CalcBlockDep(prevOp, stripe); + Emit(isa::npu_set_blockdep_t(blockdep)); + GenerateWaits(false, memoryAccesses, maxOutstandingKernelOps, outstandingDmaAccesses, outstandingNpuAccesses); + GenerateOperationCode(stripe->operation->type); + prevOp = stripe; + // Return command mapping information to the caller + int emitEnd = _emit.Position(); + if ( cmdRanges ) + { + cmdRanges->emplace_back(stripe->operation->_srcKey, emitStart, emitEnd); + } + } + else + { + auto dma = static_cast(hlc.get()); + if ( verbose ) + { + debugInfo.emplace_back(emitStart, dma->ToString()); + } + GenerateDMA(static_cast(hlc.get()), memoryAccesses); + 
GenerateWaits(true, memoryAccesses, maxOutstandingDMAOps, outstandingNpuAccesses, outstandingDmaAccesses); + Emit(isa::npu_op_dma_start_t()); + } + } + Emit(isa::npu_op_stop_t(0xFFFF)); + if ( verbose ) + { + PrintCommandStream(_emit.CommandStream(), debugInfo); + } + return _emit.CommandStream(); +} + +} // namespace regor diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.hpp b/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.hpp new file mode 100644 index 00000000..16abcf27 --- /dev/null +++ b/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.hpp @@ -0,0 +1,269 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#pragma once + +#include "common/common.hpp" +#include "common/logging.hpp" + +#include "architecture/ethos_u_register_cs_generator.hpp" +#include "architecture/ethos_u_scaling.hpp" +#include "common/data_type.hpp" +#include "compiler/high_level_command_stream.hpp" +#include "compiler/op_type.hpp" +#include "ethos_u55.hpp" + +#include +#include +#include +#include + +namespace regor +{ +class Emitter +{ +public: + Emitter() = default; + void Emit(uint32_t instr); + void Emit(uint64_t instr); + void Clear(); + int Position() const { return int(_stream.size()); } + const std::vector &CommandStream() const { return _stream; } + +private: + bool SetRegister(uint16_t reg, uint64_t value); + static bool IsCmd0(uint16_t key); + static bool IsCmd1(uint16_t key); + static bool IsOp(uint16_t key); + std::vector _stream; + std::unordered_map _registers; +}; + + +// Specifies the addresses and dimensions of the tiles of a feature map. +// A feature map can use 1 to 4 tiles +struct TileBox +{ + int height0; // The height of tile 0 + int height1; // The height of tile 1 + int width0; // The width of tile 0, and tile 2 (if used) + Address address[4]; // Tile addresses +}; + +enum BasePointerIndex +{ + WeightTensor = 0, // base address index for the Weight tensor + ScratchTensor = 1, // base address index for the Scratch_tensor in the TensorArena + ScratchFastTensor = 2, // base address for the Scratch_fast_tensor + Mem2Mem = 3, // base address slot for memory to memory transfer +}; + +enum class AccessDirection +{ + Read = 0, + Write = 1, +}; + +enum class RCSIfmScaleMode : uint8_t +{ + OPA_OPB_16 = 0, + OPA_32 = 1, + OPB_32 = 2, +}; + +enum class RCSRoundMode : uint8_t +{ + DBL = 0, + TRUNCATE = 1, + NATURAL = 2, +}; + +struct MemoryAccess +{ + AccessDirection direction; + MemArea memArea; + Address start; + Address end; + + MemoryAccess(AccessDirection direction_, MemArea area_, Address start_, Address end_) : + direction(direction_), memArea(area_), start(start_), 
end(end_) + { + } + + bool Conflicts(const MemoryAccess &other) const + { + bool overlaps = Overlaps(start, end, other.start, other.end) && memArea == other.memArea; + return overlaps && (direction != AccessDirection::Read || other.direction != AccessDirection::Read); + } +}; + +using MemoryAccesses = std::vector; + +struct LutSlot +{ + const HLCOperation *hlcOp = nullptr; + int lastUsed = 0; +}; + +/// +/// Generates register command streams for Ethos U55 and Ethos U65. +/// +class EthosU55RCSGenerator : public EthosURegisterCSGenerator +{ +public: + EthosU55RCSGenerator(ArchEthosU55 *arch); + + //---------------------------------------------------------------------- + // Print + //---------------------------------------------------------------------- + + int Disassemble(const uint32_t *in, std::string &op, std::vector> &fields); + +protected: + //---------------------------------------------------------------------- + // Helper functions + //---------------------------------------------------------------------- + void Emit(uint32_t instr); + void Emit(uint64_t instr); + + static int GetDoubleBufferOffset(HLCWeights *weights, int rangeIndex); + static void CheckAddressRange(ArchitectureMemory *memory, Address address, int size); + static void CheckAddresses(const HLCFeatureMap &fm); + // Calculates the rolling buffer address of the given coordinate. 
+ static Address AddressForCoordinate(const HLCFeatureMap &fm, const Shape &strides, const Shape &coord); + // Calculates tile sizes/addresses of a feature map + static TileBox GetTiles(const HLCFeatureMap &fm, const Shape &strides, const Box &area); + MemoryAccess ToMemoryAccess(const HLCFeatureMap &fm, const Box &area, AccessDirection direction); + // Returns region number used in NPU_SET_..._REGION + uint32_t ToRegion(const MemArea &memArea); + static bool UseZeroPoint0(OpType opType, const HLCFeatureMap &fm, bool isOFM); + // Checks if the feature map is a scalar, and if so, returns the + // quantized value in scalarValue. + static bool IsScalar(const HLCFeatureMap &fm, int32_t &scalarValue); + // Calculates waits for KERNEL_WAIT/DMA_WAIT, returns -1 if no wait is needed + // - opAccesses contains the memory accesses for the current operation + // - outstanding contains the memory accesses for ongoing "other" operations + // (DMA operations if the current op is an NPU operation, NPU operations if the current op is a DMA operation) + // Note: NPU - NPU dependency is handled via blockdep + static int CalcCommandWaits(const MemoryAccesses &opAccesses, std::deque &outstanding); + // Returns LUT slot to be used for the given LUT operation. + // Sets alreadyInLutMem to true if the LUT is already in SHRAM. + int AllocateLutSlot(std::vector &lutSlots, const HLCOperation *op, int sizeInSlots, int timestamp, bool &alreadyInLutMem); + //---------------------------------------------------------------------- + // Scaling (OFM/OPA/OPB_SCALE) + //---------------------------------------------------------------------- + + // Generates OFM_SCALE register for pooling operations + void GenerateOFMScalingForPooling(HLCOperation *poolOp); + // Generates OFM/OPA/OPB_SCALE registers for elementwise operators. 
+ // Returns the operator to scale + RCSIfmScaleMode GenerateScalingForElementwise(HLCOperation *op, int ifm0Index); + + + + //---------------------------------------------------------------------- + // BLOCKDEP calculation + //---------------------------------------------------------------------- + + // Given the area and block size, adds the first/last jobs (depending on fromStart) to jobs. + // - area: total amount of work to perform + // - block: size of each job + // - fromStart: if true, the first jobs are added, if false, the last jobs are added + // (in that case, the very last job is added last) + void GetJobs(const Box &area, const Shape &block, int nrJobsToGet, bool fromStart, std::vector &jobs); + // Calculates the value for the BLOCKDEP register + int CalcBlockDep(HLCStripe *prevStripe, HLCStripe *stripe); + + + + //---------------------------------------------------------------------- + // Register generation + //---------------------------------------------------------------------- + + void GeneratePadding(const HLCPadding &padding); + // Generates ACTIVATION registers + void GenerateActivation(const HLCStripe *stripe, MemoryAccesses &memoryAccesses); + // Generates KERNEL related registers + void GenerateKernel(const Kernel &kernel, bool partKernel); + // Generates IFM2_BROADCAST register for binary elementwise operations + void GenerateIFM2Broadcast(const Shape &ifmShape, const Shape &ifm2Shape, bool reversedOperands, bool isScalar); + // Generates IFM_PRECISION register + void GenerateIFMPrecision(const HLCFeatureMap &fm, RCSIfmScaleMode scaleMode); + // Generates IFM2_PRECISION register + void GenerateIFM2Precision(const HLCFeatureMap &fm); + // Generates OFM_PRECISION register + void GenerateOFMPrecision(const HLCFeatureMap &fm, bool useGlobalScale, RCSRoundMode roundMode); + // Generates common IFM registers + void GenerateIFM(OpType opType, const HLCFeatureMap &fm, const Box &inputArea); + // Generates common IFM2 registers + void 
GenerateIFM2(OpType opType, const HLCFeatureMap &fm, const Box &inputArea, bool isScalar, int32_t scalarValue); + // Generates OFM registers + void GenerateOFM(OpType opType, const HLCFeatureMap &fm, const Box &outputArea); + // Generates WEIGHT registers + void GenerateWeights(const HLCStripe *stripe, MemoryAccesses &memoryAccesses); + // Generates SCALE registers + void GenerateScales(const HLCStripe *stripe, MemoryAccesses &memoryAccesses); + // Generates OFM_BLK_HEIGHT/WIDTH/DEPTH registers + void GenerateBlockConfig(const EthosU55OpConfig *config); + // Generates IB_END/IB_START/AB_START/ACC_FORMAT registers + void GenerateShramRegisters(const EthosU55OpConfig *config, bool hasIfm2); + // Calculates and generates KERNEL_WAIT or DMA_WAIT register + void GenerateWaits(bool isKernelWait, const MemoryAccesses &memoryAccesses, int maxWaits, + std::deque &outstandingAccesses, std::deque &accessesToUpdate); + // Inserts DMA commands for copying LUTs from constant memory + // to LUT memory + std::vector> InsertLUTDMACommands(std::vector> &cmds); + + //---------------------------------------------------------------------- + // Operations + //---------------------------------------------------------------------- + + // Generates NPU_OP_* command + void GenerateOperationCode(OpType opType); + void GenerateCommon(const HLCStripe *stripe, bool useGlobalScale, RCSIfmScaleMode opToScale, + MemoryAccesses &memoryAccesses, int ifm0Index = 0); + // Conv2D/Depthwise operations + void GenerateConvolutionOp(const HLCStripe *stripe, MemoryAccesses &memoryAccesses); + // MaxPool/AvgPool/ResizeBilinear or operations that are mapped to AvgPool + void GeneratePoolingOp(HLCStripe *stripe, MemoryAccesses &memoryAccesses); + // Elementwise operations + void GenerateElementwiseOp(HLCStripe *stripe, MemoryAccesses &memoryAccesses); + bool GenerateStripe(HLCStripe *stripe, MemoryAccesses &memoryAccesses); + // Generates register commands for DMA operations + void GenerateDMA(const HLCDMA 
*dma, MemoryAccesses &memoryAccesses); + + virtual void GenerateInitialRegisterSetup() + { + // No special initial setup for Ethos U55 + } + +public: + std::vector GenerateCommandStream(std::vector> &highLevelCommandStream, + std::vector> *cmdRanges, bool verbose) override; + + static uint32_t IdRegister(); + static bool IsSupportedElementwise(const OpType opType); + +private: + ArchEthosU55 *_arch; + // For stripes that use LUT: the LUT slot to be used + std::unordered_map _stripeToLutSlot; + Emitter _emit; +}; + +} // namespace regor diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_scaling.cpp b/ethosu/regor/architecture/ethosu55/ethos_u55_scaling.cpp new file mode 100644 index 00000000..6d40d4bb --- /dev/null +++ b/ethosu/regor/architecture/ethosu55/ethos_u55_scaling.cpp @@ -0,0 +1,325 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#include "ethos_u55_scaling.hpp" + +#include "architecture/ethos_u_scaling.hpp" +#include "compiler/high_level_command_stream.hpp" +#include "compiler/op_type.hpp" +#include "compiler/quantization.hpp" + +namespace regor::ethosU55Scaling +{ +namespace +{ +void AdvancedElementwiseAddSubScale(double input1Scale, double input2Scale, double outputScale, int bitDepth, + QuantizedScale &input1Rescale, QuantizedScale &outScale) +{ + auto maxInputScale = std::max(input1Scale, input2Scale); + auto minInputScale = std::min(input1Scale, input2Scale); + int inputShift = bitDepth == 8 ? 20 : 15; + double ifm1Rescale; + double ifm2Rescale; + SimplifiedElementwiseAddSubScale(minInputScale, maxInputScale, outputScale, inputShift, ifm1Rescale, ifm2Rescale, outScale); + input1Rescale = QuantizedScale(ifm1Rescale); +} + +float GetScale(const Quantization *quant) +{ + if ( quant != nullptr && quant->scales.size() != 0 ) + { + return float(quant->scales[0].Dequantize()); + } + else + { + return 1.0f; + } +} + +} // namespace + +void RescaleElementwise(HLCOperation *op) +{ + int ifmCnt = int(op->ifm.size()); + Quantization *ifm1Quant = &op->ifm[0].quantization; + Quantization *ifm2Quant = ifmCnt == 2 ? 
&op->ifm[1].quantization : nullptr; + Quantization *ofmQuant = &op->ofm.quantization; + + if ( ifm1Quant->type == QuantizationType::EXPLICIT && ofmQuant->type == QuantizationType::EXPLICIT && + (ifm2Quant == nullptr || ifm2Quant->type == QuantizationType::EXPLICIT) ) + { + return; + } + + QuantizedScale outScale(1, 0); + + double ifm1Scale = GetScale(ifm1Quant); + double ifm2Scale = GetScale(ifm2Quant); + double ofmScale = GetScale(ofmQuant); + + DataType ifmDataType = op->ifm[0].dataType; + OpType opType = op->type; + + bool allHaveScale = + (!ifm1Quant->scales.empty() && !ofmQuant->scales.empty() && ifm2Quant != nullptr && !ifm2Quant->scales.empty()); + + if ( opType == OpType::Mul ) + { + if ( allHaveScale ) + { + outScale = ElementwiseMulScale(ifm1Scale, ifm2Scale, ofmScale); + } + } + else if ( opType == OpType::Abs || opType == OpType::LeakyRelu ) + { + outScale = QuantizedScale(ofmScale); + } + else if ( opType == OpType::Add || opType == OpType::Sub ) + { + int bitDepth = DataTypeSizeBits(ifmDataType); + bool useAdvancedScaling = false; + uint32_t opaScale = 1; + uint32_t opbScale = 1; + int opaShift = 0; + int opbShift = 0; + double ifm1Rescale; + double ifm2Rescale; + if ( allHaveScale ) + { + if ( ifm1Scale == ifm2Scale ) + { + SimplifiedElementwiseAddSubScale(ifm1Scale, ifm2Scale, ofmScale, 16, ifm1Rescale, ifm2Rescale, outScale); + opaScale = uint32_t(round(ifm1Rescale)); + opbScale = uint32_t(round(ifm2Rescale)); + if ( bitDepth == 16 ) + { + // Align the double rounding with that of advanced scaling + opaScale /= 2; + opbScale /= 2; + --outScale.shift; + } + else + { + // For 8 bit we can't guarantee double rounding with simplified scaling will always be + // the same as with advanced scaling due to different shifts. When the ofm scale fulfils + // the following we know that double rounding will have no effect for advanced scaling + // no matter the input, so we can safely use simplified scaling with double rounding disabled. 
+ useAdvancedScaling = (outScale.scale & 0xFFF) != 0; + } + } + else + { + useAdvancedScaling = true; + } + if ( useAdvancedScaling ) + { + // Use advanced implementation only when input/output scales differ, + // or when we can't guarantee the absence of rounding errors + QuantizedScale inScale(1, 0); + AdvancedElementwiseAddSubScale(ifm1Scale, ifm2Scale, ofmScale, bitDepth, inScale, outScale); + if ( ifm1Scale <= ifm2Scale ) + { + opaScale = inScale.scale; + opaShift = inScale.shift; + opbScale = 0; + opbShift = 0; + } + else + { + opaScale = 0; + opaShift = 0; + opbScale = inScale.scale; + opbShift = inScale.shift; + } + } + } + if ( ifm1Quant != nullptr && ifm1Quant->type == QuantizationType::TFLITE ) + { + ifm1Quant->scales.clear(); + ifm1Quant->scales.push_back({int32_t(opaScale), opaShift}); + ifm1Quant->type = QuantizationType::EXPLICIT; + } + if ( ifm2Quant != nullptr && ifm2Quant->type == QuantizationType::TFLITE ) + { + ifm2Quant->scales.clear(); + ifm2Quant->scales.push_back({int32_t(opbScale), opbShift}); + ifm2Quant->type = QuantizationType::EXPLICIT; + } + } + if ( ofmQuant != nullptr && ofmQuant->type == QuantizationType::TFLITE ) + { + ofmQuant->scales.clear(); + ofmQuant->scales.push_back(outScale); + ofmQuant->type = QuantizationType::EXPLICIT; + } +} + +void RescalePooling(HLCOperation *op, bool isNoOp) +{ + + Quantization *ifm1Quant = &op->ifm[0].quantization; + Quantization *ofmQuant = &op->ofm.quantization; + uint32_t scale = 1; + int shift = 0; + DataType ifmDataType = op->ifm[0].dataType; + OpType opType = op->type; + + if ( ofmQuant->type != QuantizationType::TFLITE ) + { + // Explicit scaling + return; + } + + if ( !ifm1Quant->scales.empty() && !ofmQuant->scales.empty() ) + { + double ifmScale = GetScale(ifm1Quant); + double ofmScale = GetScale(ofmQuant); + auto actType = op->subOps.empty() ? 
opType : op->subOps[0].type; + if ( actType == OpType::Sigmoid || actType == OpType::Tanh ) + { + double rescale = 0x3000 * ifmScale; + if ( ifmDataType == DataType::Int16 ) + { + // Calculate scale and shift for the output scale of 1/(3*4096) + double xLog2 = std::log2(ifmScale); + int roundedLog2 = int(std::round(xLog2)); + bool isPowerOf2 = std::abs(xLog2 - roundedLog2) < 0.001; + shift = roundedLog2 + 12; + if ( isPowerOf2 && ((actType == OpType::Tanh && (shift == 0 || shift == 1)) || (actType == OpType::Sigmoid && (shift == 0))) ) + { + // Special handling if input scale is 1/2048 or 1/4096 + scale = 3 << shift; + shift = 0; + } + else + { + shift = 0; + int maxRescale = 16384; + while ( rescale < maxRescale && shift <= 30 ) + { + shift++; + rescale *= 2; + } + scale = uint32_t(rescale); + } + } + else + { + QuantizePoolingScaleMaxPrecision(op->kernel.ElementsWH(), rescale, scale, shift, 32); + } + } + else if ( opType == OpType::MemoryCopy ) + { + double rescale = ifmScale / ofmScale; + // In case of concat or other memory operation, rescaling might be needed. 
+ // The scale is maximised, to get maximum precision + QuantizePoolingScaleMaxPrecision(op->kernel.ElementsWH(), rescale, scale, shift, 32); + } + else if ( opType == OpType::Quantize ) + { + // Quantize operations need double-precision scaling + QuantizedScale quantScale(ifmScale / ofmScale); + scale = uint32_t(quantScale.scale); + shift = quantScale.shift; + } + else if ( isNoOp ) + { + QuantizedScale quantScale(float(ifmScale) / float(ofmScale)); + scale = uint32_t(quantScale.scale); + shift = quantScale.shift; + } + else + { + // Normal pooling operation, without need for special scaling + double rescale = ifmScale / ofmScale; + QuantizePoolingScale(op->kernel.ElementsWH(), rescale, 0, scale, shift, 32); + } + } + ofmQuant->scales.clear(); + ofmQuant->scales.push_back({int32_t(scale), shift}); + ofmQuant->type = QuantizationType::EXPLICIT; +} + +Quantization RescalePerChannel(const Quantization &ifmQuant, const Quantization &weightQuant, + const Quantization &ofmQuant, const DataType scaleDataType, const DataType ifmDataType) +{ + if ( ofmQuant.type != QuantizationType::TFLITE ) + { + // Explicit quantized scale has already been set + return ofmQuant; + } + + Quantization quantResult; + quantResult.type = QuantizationType::EXPLICIT; + quantResult.zeroPoints = ofmQuant.zeroPoints; + quantResult.quantMin = ofmQuant.quantMin; + quantResult.quantMax = ofmQuant.quantMax; + quantResult.dimension = ofmQuant.dimension; + quantResult.forceZeroPoint = ofmQuant.forceZeroPoint; + + if ( !ifmQuant.scales.empty() && !ofmQuant.scales.empty() && !weightQuant.scales.empty() ) + { + DataType dataType = DataType::None; + bool reducedScale = false; + if ( scaleDataType == DataType::Int32 ) + { + switch ( ifmDataType ) + { + case DataType::Int8: + case DataType::UInt8: + case DataType::Int16: + dataType = ifmDataType; + break; + default: + break; + } + } + else if ( scaleDataType == DataType::Int64 && DataTypeSizeBits(ifmDataType) == 16 ) + { + dataType = DataType::Int16; + 
reducedScale = true; + } + + int modIfm = (ifmQuant.scales.size()) == 1 ? 0 : -1; + int modOfm = (ofmQuant.scales.size()) == 1 ? 0 : -1; + + quantResult.scales.reserve(weightQuant.scales.size()); + + for ( int i = 0; i < int(weightQuant.scales.size()); i++ ) + { + double v = 1.0; + float ifmScale = float(ifmQuant.scales[i & modIfm].Dequantize()); + float ofmScale = float(ofmQuant.scales[i & modOfm].Dequantize()); + float weightScale = float(weightQuant.scales[i].Dequantize()); + if ( dataType == DataType::UInt8 ) + { + v = double(ifmScale * weightScale) / double(ofmScale); + } + else if ( dataType == DataType::Int8 || dataType == DataType::Int16 ) + { + v = (double(ifmScale) * double(weightScale)) / double(ofmScale); + } + + quantResult.scales.emplace_back(v, reducedScale); + } + } + + return quantResult; +} + +} // namespace regor::ethosU55Scaling diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_scaling.hpp b/ethosu/regor/architecture/ethosu55/ethos_u55_scaling.hpp new file mode 100644 index 00000000..84ce0894 --- /dev/null +++ b/ethosu/regor/architecture/ethosu55/ethos_u55_scaling.hpp @@ -0,0 +1,39 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#pragma once + +#include "common/data_type.hpp" + +namespace regor +{ + +class Quantization; +struct HLCOperation; + +namespace ethosU55Scaling +{ + + +void RescalePooling(HLCOperation *op, bool isNoOp); +void RescaleElementwise(HLCOperation *op); +Quantization RescalePerChannel(const Quantization &ifmQuant, const Quantization &weightQuant, + const Quantization &ofmQuant, const DataType scaleDataType, const DataType ifmDataType); + +} // namespace ethosU55Scaling +} // namespace regor diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_weight_encoder.cpp b/ethosu/regor/architecture/ethosu55/ethos_u55_weight_encoder.cpp new file mode 100644 index 00000000..8e1b18f0 --- /dev/null +++ b/ethosu/regor/architecture/ethosu55/ethos_u55_weight_encoder.cpp @@ -0,0 +1,487 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#include "ethos_u55_weight_encoder.hpp" + +#include "common/logging.hpp" + +#include "architecture/architecture.hpp" +#include "architecture/ethos_u_scaling.hpp" +#include "architecture/mlw_encode.hpp" +#include "common/buffer_view.hpp" +#include "common/shape.hpp" +#include "compiler/tensor_properties.hpp" +#include "ethos_u55.hpp" +#include "ethos_u55_scaling.hpp" + +#include + +namespace regor +{ + + +EthosU55WeightEncoder::EthosUEncodingConfig::EthosUEncodingConfig(int cores) : _cores(cores) +{ +} + +void EthosU55WeightEncoder::EthosUEncodingConfig::Rehash() +{ + _depthOffsetHash = 0;  // Recompute BEFORE _hash so the combined hash reflects the current depthOffsets + for ( int offset : this->depthOffsets ) + { + _depthOffsetHash = _depthOffsetHash * 31 ^ offset; + } + + _hash = SimpleHash32(ofmBlockDepth, traversal, _depthOffsetHash, ifmType, dilation, ohwiStrides); +} + +uint32_t EthosU55WeightEncoder::EthosUEncodingConfig::Hash() +{ + return _hash; +} + +bool EthosU55WeightEncoder::EthosUEncodingConfig::Equals(IWeightEncodingConfig *other) +{ + EthosUEncodingConfig *p = static_cast(other); + return std::tie(ofmBlockDepth, traversal, _depthOffsetHash, ifmType, dilation, ohwiStrides) == + std::tie(p->ofmBlockDepth, p->traversal, p->_depthOffsetHash, p->ifmType, p->dilation, p->ohwiStrides);  // compare other's strides, not our own +} + +const std::vector &EthosU55WeightEncoder::EthosUEncodingConfig::DepthOffsets() +{ + return this->depthOffsets; +} + +Flags EthosU55WeightEncoder::EthosUEncodingConfig::Format() +{ + return WeightFormat::Default; +} + + +std::unique_ptr EthosU55WeightEncoder::GetEncodingConfig(ArchitectureOpConfig *opCfg, + const WeightsRef &weights, const Kernel *kernel, DataType ifmType, const std::vector &depthOffsets, Flags) +{ + std::unique_ptr params = std::make_unique(_arch->_cores); + + EthosU55OpConfig *opConfig = static_cast(opCfg); + params->ofmBlockDepth = opConfig->OfmBlock().Depth(); + params->traversal = opConfig->Traversal(); + params->depthOffsets = depthOffsets; + params->ifmType = ifmType; + params->dilation = kernel->Dilation(); + 
assert(!weights.isScales); + Shape ohwiStrides = weights.view->StrideBytes() * 8 / DataTypeSizeBits(weights.type); + if ( weights.axisOrder == AxisOrder::IHWO ) + { + ohwiStrides = ohwiStrides.Extract(3, 1, 2, 0); + } + params->ohwiStrides = std::move(ohwiStrides); + params->ohwiStrides[0] = params->ohwiStrides[0] * _arch->_cores; + params->Rehash(); + + return params; +} + +int EthosU55WeightEncoder::StreamsRequired(IWeightEncodingConfig *, const Shape &weightShape, int &scaleStreamsRequired) +{ + scaleStreamsRequired = std::min(weightShape[0], _arch->_cores); + return scaleStreamsRequired; +} + +static int EncodeBias(int64_t bias, int32_t scale, int shift, uint8_t data[10]) +{ + assert(-(1LL << (40 - 1)) <= bias && bias < (1LL << (40 - 1))); // signed 40-bit range + assert(0 <= shift && shift < (1 << 6)); // unsigned 6-bit range + + data[0] = uint8_t((bias >> (0 * 8)) & 0xFF); + data[1] = uint8_t((bias >> (1 * 8)) & 0xFF); + data[2] = uint8_t((bias >> (2 * 8)) & 0xFF); + data[3] = uint8_t((bias >> (3 * 8)) & 0xFF); + data[4] = uint8_t((bias >> (4 * 8)) & 0xFF); + data[5] = uint8_t((scale >> (0 * 8)) & 0xFF); + data[6] = uint8_t((scale >> (1 * 8)) & 0xFF); + data[7] = uint8_t((scale >> (2 * 8)) & 0xFF); + data[8] = uint8_t((scale >> (3 * 8)) & 0xFF); + data[9] = uint8_t(shift & 0x3F); + return 10; +} + + +template +class EthosUWeightOrdering : public WeightSourceCommon +{ +protected: + // Transform + WeightTransformParam *_param; + WeightTransformFunc _transform; + // Loop Limits + int _ofmBlockDepth; + int _ifmBlockDepth; + short _ofmUBlockDepth; + short _ifmUBlockDepth; + short _decompX; + short _decompY; + short _subKernelRound; + // Saved state + int _ofmBlockZ = 0; + int _ifmBlockZ = 0; + int _subKernelX = 0; + int _subKernelY = 0; + int _ifmUBlockOuter = 0; + int _ifmUBlockInner = 0; + int _ofmUBlockZ = 0; + int _ifmUBlockZ = 0; + int _kernelElement = 0; + int _ofmUBlock = 0; + EthosUTraversal _traversal; + +public: + EthosUWeightOrdering(int cores, const 
Point2i &dilation, int ofmBlockDepth, int ifmBitDepth, int ofmUBlockDepth, + int ifmUBlockDepth, WeightTransformFunc func, WeightTransformParam *param, EthosUTraversal traversal) + { + _streams = cores; + _ofmBlockDepth = ofmBlockDepth; + _ifmBlockDepth = ((traversal == EthosUTraversal::PartKernel) || (ifmBitDepth == 16)) ? 16 : 32; + _ofmUBlockDepth = short(ofmUBlockDepth); + _ifmUBlockDepth = short(ifmUBlockDepth); + _decompX = short(8 / dilation.x); + _decompY = short(8 / dilation.y); + if ( traversal == EthosUTraversal::Depthwise ) + { + _subKernelRound = 4; + } + else if ( traversal == EthosUTraversal::PartKernel ) + { + _subKernelRound = (ifmBitDepth == 16) ? 2 : 4; + } + else + { + _subKernelRound = 1; + } + _transform = func; + _param = param; + _traversal = traversal; + } + + void SetSource(const void *buffer, int depthOffset, const Shape &ohwiShape, const Shape &ohwiStrides, int streamIndex) override + { + SetSourceCommon(buffer, depthOffset + streamIndex, ohwiShape, ohwiStrides, streamIndex, true); + } + +public: + int Get(int16_t *output, int count) override + { + if ( _traversal == EthosUTraversal::Depthwise ) return GetNext(output, count); + else if ( _traversal == EthosUTraversal::PartKernel ) return GetNext(output, count); + return GetNext(output, count); + } + + template + int GetNext(int16_t *output, int count) + { + if ( _ofmBlockZ >= _ofmDepth ) + { + return 0; + } + + int ofmBlockZ, ifmBlockZ; + int ifmUBlockOuter, ifmUBlockInner; + int ifmUBlockZ, ofmUBlockZ, ofmUBlock; + int subKernelX, subKernelY; + int kernelElement; + int16_t *write = output; + + const TYPE *buffer = reinterpret_cast(_source); + int streamBlockDepth = (_ofmBlockDepth + _streams - 1 - _streamIndex) / _streams; + + for ( ofmBlockZ = _ofmBlockZ; ofmBlockZ < _ofmDepth; ofmBlockZ += streamBlockDepth ) + { + int clippedOfmBlockDepth = std::min(streamBlockDepth, _ofmDepth - ofmBlockZ); + // IFM blocks required for the brick + for ( ifmBlockZ = _ifmBlockZ; ifmBlockZ < 
(IS_DEPTHWISE ? 1 : _ifmDepth); ifmBlockZ += _ifmBlockDepth ) + { + int clippedIfmBlockDepth; + if ( IS_DEPTHWISE ) + { + clippedIfmBlockDepth = _ifmUBlockDepth; + } + else + { + clippedIfmBlockDepth = IS_PARTKERNEL ? std::min(_ifmBlockDepth, _ifmDepth - ifmBlockZ) : _ifmBlockDepth; + } + + // Weight decomposition + // Subkernel Splitting (H) + for ( subKernelY = _subKernelY; subKernelY < _kernelH; subKernelY += _decompY ) + { + int subHeight = std::min(_kernelH - subKernelY, _decompY); + // Subkernel splitting (W) + for ( subKernelX = _subKernelX; subKernelX < _kernelW; subKernelX += _decompX ) + { + int subWidth = std::min(_kernelW - subKernelX, _decompX); + int subKernelElements = subWidth * subHeight; + + // Part-kernel first works across the kernel H/W and needs padding + subKernelElements = RoundAway(subKernelElements, _subKernelRound); + + int ifmBlockDepthOuter = IS_PARTKERNEL ? clippedIfmBlockDepth : 1; + int ifmBlockDepthInner = IS_PARTKERNEL ? 1 : clippedIfmBlockDepth; + + for ( ifmUBlockOuter = _ifmUBlockOuter; ifmUBlockOuter < ifmBlockDepthOuter; ifmUBlockOuter += _ifmUBlockDepth ) + { + // OFM uBlocks in OFM-block over depth + for ( ofmUBlock = _ofmUBlock; ofmUBlock < clippedOfmBlockDepth; ofmUBlock += _ofmUBlockDepth ) + { + // HW Kernel element traversal - cannot be a H/W loop due to element + // padding requirement on depthwise/part-kernel configurations + for ( kernelElement = _kernelElement; kernelElement < subKernelElements; kernelElement++ ) + { + int kx = kernelElement % subWidth; + int ky = kernelElement / subWidth; + // IFM uBlocks in IFM-block over depth (only 1 uBlock if depthwise) + // In case of part-kernel-first IFM uBlock traversal have already been handled + // and this loop is ignored. 
+ for ( ifmUBlockInner = _ifmUBlockInner; ifmUBlockInner < ifmBlockDepthInner; ifmUBlockInner += _ifmUBlockDepth ) + { + int ifmUBlock = ifmUBlockInner + ifmUBlockOuter; + // Feed OFM uBlock elements + for ( ofmUBlockZ = _ofmUBlockZ; ofmUBlockZ < _ofmUBlockDepth; ofmUBlockZ++ ) + { + // Source IFM uBlock elements (only 1 element deep if depthwise) + for ( ifmUBlockZ = _ifmUBlockZ; ifmUBlockZ < (IS_DEPTHWISE ? 1 : _ifmUBlockDepth); ifmUBlockZ++ ) + { + // Source position within the current subkernel + int wx = subKernelX + kx; + int wy = subKernelY + ky; + // Source IFM/OFM slices + int ifm_z = ifmBlockZ + ifmUBlock + ifmUBlockZ; + int ofm_z = ofmBlockZ + ofmUBlock + ofmUBlockZ; + if ( (ifm_z < _ifmDepth) && (ofm_z < _ofmDepth) && (ky < subHeight) ) + { + _param->o = ofm_z; + _param->h = wy; + _param->w = wx; + _param->i = ifm_z; + int weight = int(buffer[WeightIndex(ofm_z, wy, wx, ifm_z)]); + *write = int16_t(_transform(_param, weight)); + } + else + { + *write = 0; + } + write++; + if ( --count == 0 ) + { + // Save state + _ifmUBlockZ = ifmUBlockZ + 1; + _ofmUBlockZ = ofmUBlockZ; + _ifmUBlockInner = ifmUBlockInner; + _kernelElement = kernelElement; + _ofmUBlock = ofmUBlock; + _ifmUBlockOuter = ifmUBlockOuter; + _subKernelX = subKernelX; + _subKernelY = subKernelY; + _ifmBlockZ = ifmBlockZ; + _ofmBlockZ = ofmBlockZ; + // Return weights generated (less than requested count == EOS) + return int(intptr_t(write - output)); + } + } + _ifmUBlockZ = 0; + } + _ofmUBlockZ = 0; + } + _ifmUBlockInner = 0; + } + _kernelElement = 0; + } + _ofmUBlock = 0; + } + _ifmUBlockOuter = 0; + } + _subKernelX = 0; + } + _subKernelY = 0; + } + _ifmBlockZ = 0; + } + _ofmBlockZ = 0; + return int(intptr_t(write - output)); + } +}; + + +std::unique_ptr EthosU55WeightEncoder::GetWeightSource( + IWeightEncodingConfig *config, DataType weightType, WeightTransformFunc func, WeightTransformParam *param) +{ + int ofmUBlockDepth = _arch->_ofmUBlock.Depth(); + int ifmUBlockDepth = 
_arch->_ifmUBlock.Depth(); + + EthosUEncodingConfig *cfg = static_cast(config); + int ifmBitDepth = DataTypeSizeBits(cfg->ifmType); + + if ( weightType == DataType::UInt8 ) + { + return std::make_unique>(_arch->_cores, cfg->dilation, cfg->ofmBlockDepth, + ifmBitDepth, ofmUBlockDepth, ifmUBlockDepth, func, param, cfg->traversal); + } + else if ( weightType == DataType::Int8 ) + { + return std::make_unique>(_arch->_cores, cfg->dilation, cfg->ofmBlockDepth, + ifmBitDepth, ofmUBlockDepth, ifmUBlockDepth, func, param, cfg->traversal); + } + + assert(false && "No weight source for this datatype"); + return nullptr; +} + + +template +class EthosUScaleSource : public IVolumeScaleSource +{ +private: + const TYPE *_buffer = nullptr; + const QuantizedScale *_scales = nullptr; + int _biasIndex = 0; + int _biasCount = 0; + int _streamIndex = 0; + int _streams = 0; + Quantization _quantization; + +public: + EthosUScaleSource(int cores, Quantization quantization) : _streams(cores), _quantization(std::move(quantization)) + { + assert(!_quantization.scales.empty()); + // assert that no scale is out of range + auto invalidScale = std::find_if(std::begin(_quantization.scales), std::end(_quantization.scales), + [](const auto q) { return q.shift < 0 || q.shift >= 64; }); + assert(invalidScale == std::end(_quantization.scales)); + } + + int Elements() + { + assert(_biasCount >= 0); + return _biasCount; + } + + int Get(int64_t *biasBuffer, QuantizedScale *quantBuffer, int count) + { + count = std::min(count, _biasCount); + const size_t scaleSize = _quantization.scales.size(); + + for ( int i = 0; i < count; i++ ) + { + int index = _biasIndex + (i * _streams); + *biasBuffer++ = _buffer[index]; + *quantBuffer++ = _quantization.scales[index % scaleSize]; + _biasCount--; + } + + _biasIndex += (count * _streams); + return count; + } + + void SetSource(const void *buffer, int biasCount, int depthOffset, int depthLength, int streamIndex) + { + assert(streamIndex >= 0 && streamIndex < _streams); 
+ UNUSED(biasCount); + assert(depthOffset + depthLength <= biasCount); + assert(uintptr_t(buffer) % alignof(TYPE) == 0); + _buffer = reinterpret_cast(buffer); + _biasIndex = depthOffset + streamIndex; // Where to start in the buffer + _biasCount = (depthLength + _streams - 1 - streamIndex) / _streams; // How many biases to generate + } +}; + + +std::unique_ptr EthosU55WeightEncoder::GetScaleSource( + IWeightEncodingConfig *config, DataType scaleType, const Quantization &explicitQuant) +{ + EthosUEncodingConfig *cfg = static_cast(config); + assert(explicitQuant.type == QuantizationType::EXPLICIT); + + if ( scaleType == DataType::Int32 ) + { + return std::make_unique>(_arch->_cores, explicitQuant); + } + else if ( scaleType == DataType::Int64 && DataTypeSizeBits(cfg->ifmType) == 16 ) + { + return std::make_unique>(_arch->_cores, explicitQuant); + } + + return nullptr; +} + +Quantization EthosU55WeightEncoder::MakeExplicit(const Quantization &ifmQ, const Quantization &weightQ, + const Quantization &ofmQ, DataType scaleType, DataType ifmType) +{ + if ( scaleType == DataType::Int64 && DataTypeSizeBits(ifmType) == 16 ) scaleType = DataType::Int32; + + return ethosU55Scaling::RescalePerChannel(ifmQ, weightQ, ofmQ, scaleType, ifmType); +} + + +WeightsInfo EthosU55WeightEncoder::EncodeWeights( + IWeightEncodingConfig *config, IWeightSource *source, std::vector &result, bool measureOnly) +{ + [[maybe_unused]] EthosUEncodingConfig *cfg = static_cast(config); + assert(cfg->Format() == WeightFormat::Default); + unsigned flags = measureOnly ? 
MLW_ENCODE_NO_BITSTREAM : MLW_ENCODE_FLAG_NONE; + auto res = mle_encode_proxy(source, 128 * 1024, result, flags); + return {res.elements_read, res.bytes_written, res.zero_count}; +} + + +int EthosU55WeightEncoder::EncodeScales(IWeightEncodingConfig *config, IScaleSource *source, std::vector &result, bool measureOnly) +{ + UNUSED(config); + constexpr int BUFFER_SIZE = 8; + constexpr int SCALE_ELEMENT_SIZE = 10; + + if ( measureOnly ) + { + return source->Elements() * SCALE_ELEMENT_SIZE; // Must be accurate + } + + int64_t scaleBuffer[BUFFER_SIZE]; + QuantizedScale quantBuffer[BUFFER_SIZE]; + + int start = int(result.size()); + int write = start; + result.reserve(start + source->Elements() * SCALE_ELEMENT_SIZE); + while ( true ) + { + int count = source->Get(scaleBuffer, quantBuffer, BUFFER_SIZE); + result.resize(write + (count * SCALE_ELEMENT_SIZE)); + + for ( int i = 0; i < count; i++ ) + { + write += EncodeBias(scaleBuffer[i], quantBuffer[i].scale, quantBuffer[i].shift, &result[write]); + } + + if ( count < BUFFER_SIZE ) + { + break; + } + } + + return write - start; +} + +} // namespace regor diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_weight_encoder.hpp b/ethosu/regor/architecture/ethosu55/ethos_u55_weight_encoder.hpp new file mode 100644 index 00000000..24fcffa7 --- /dev/null +++ b/ethosu/regor/architecture/ethosu55/ethos_u55_weight_encoder.hpp @@ -0,0 +1,86 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "architecture/architecture.hpp" +#include "architecture/ethos_u_scaling.hpp" +#include "architecture/mlw_encode.hpp" +#include "architecture/weight_encoder.hpp" +#include "common/shape.hpp" +#include "ethos_u55.hpp" + +namespace regor +{ + +/// +/// Encodes weights and biases. Common implementation for Ethos U55 and Ethos U65. +/// +class EthosU55WeightEncoder : public WeightEncoder +{ +private: + struct EthosUEncodingConfig : IWeightEncodingConfig + { + private: + uint32_t _hash = 0; + uint32_t _depthOffsetHash = 0; + int _cores = 0; + + public: + DataType ifmType = DataType::None; + int ofmBlockDepth = 0; + EthosUTraversal traversal = EthosUTraversal::DepthFirst; + std::vector depthOffsets; + Point2i dilation; + Shape ohwiStrides; + + public: + EthosUEncodingConfig(int cores); + void Rehash(); + uint32_t Hash() override; + bool Equals(IWeightEncodingConfig *other) override; + const std::vector &DepthOffsets() override; + Flags Format() override; + }; + +public: + EthosU55WeightEncoder(ArchEthosU55 *arch) : _arch(arch) {} + +public: + std::unique_ptr GetEncodingConfig(ArchitectureOpConfig *opCfg, const WeightsRef &weights, + const Kernel *kernel, DataType ifmType, const std::vector &depthOffsets, Flags format); + + int StreamsRequired(IWeightEncodingConfig *config, const Shape &weightShape, int &scaleStreamsRequired); + + std::unique_ptr GetWeightSource( + IWeightEncodingConfig *config, DataType weightType, WeightTransformFunc func, WeightTransformParam *param); + + std::unique_ptr GetScaleSource(IWeightEncodingConfig *config, DataType scaleType, const Quantization &explicitQuant); + + Quantization MakeExplicit(const Quantization &ifmQ, const Quantization &weightQ, const Quantization &ofmQ, + DataType scaleType, DataType ifmType); + + WeightsInfo EncodeWeights(IWeightEncodingConfig *config, IWeightSource *source, std::vector 
&result, bool measureOnly); + + int EncodeScales(IWeightEncodingConfig *config, IScaleSource *source, std::vector &result, bool measureOnly); + +private: + ArchEthosU55 *_arch; +}; + +} // namespace regor diff --git a/ethosu/regor/architecture/ethosu65/ethos_u65.cpp b/ethosu/regor/architecture/ethosu65/ethos_u65.cpp new file mode 100644 index 00000000..5b7d71c3 --- /dev/null +++ b/ethosu/regor/architecture/ethosu65/ethos_u65.cpp @@ -0,0 +1,91 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2023 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#include "ethos_u65.hpp" + +#include "common/common.hpp" +#include "common/logging.hpp" + +#include "common/numeric_util.hpp" +#include "ethos_u65_register_cs_generator.hpp" + +#include +#include +#include + +namespace regor +{ + +static const EthosU55PerfInfo s_EthosU65PerfInfo[] = { + // Accelerator.Ethos_U65_256 + {{0.625, 1.125, 0.5, 0.375, 0.5, 0.75, 0.125, 0.25}, {1.0, 0.25, 0.0}}, + // Accelerator.Ethos_U65_512 + {{0.3125, 0.5625, 0.25, 0.1875, 0.25, 0.375, 0.0625, 0.125}, {0.5, 0.125, 0.0}}}; + +static const ArchEthosU55::AcceleratorConfig s_EthosU65Configs[] = { + // Accelerator.Ethos_U65_256 + {256, 1, Shape(2, 2, 8), Shape(2, 2, 8), 48, {8, 8, 8, 8, 16, 8, 16, 20}, 8, &s_EthosU65PerfInfo[0]}, + // Accelerator.Ethos_U65_512 + {256, 2, Shape(2, 2, 8), Shape(2, 2, 8), 48, {8, 8, 8, 8, 16, 8, 16, 20}, 8, &s_EthosU65PerfInfo[1]}, +}; + +ArchEthosU65::ArchEthosU65() +{ + _rcsGenerator = std::make_unique(this); +} + +bool ArchEthosU65::ParseConfig(IniReader *reader) +{ + // Parse architecture configuration + std::string key; + int macs = 0; + int cores = 0; + while ( reader->Begin(key) ) + { + if ( key == "macs" ) + { + macs = reader->Get(); + } + else if ( key == "cores" ) + { + cores = reader->Get(); + } + reader->End(); + } + + // Find the requested MAC configuration for this accelerator + auto cfg = std::find_if(s_EthosU65Configs, std::cend(s_EthosU65Configs), + [&](const AcceleratorConfig &config) { return config.macs == macs && config.cores == cores; }); + if ( cfg == std::cend(s_EthosU65Configs) ) + { + assert(macs == 256 && ((cores == 1) || (cores == 2))); + LOG_TRACE0("Unable to find U65 accelerator for macs={} cores={}", macs, cores); + return false; + } + + ApplyConfig(cfg); + + return true; +} + +std::vector ArchEthosU65::ConfigRegisters() +{ + return std::vector(1, ConfigRegister(1)); +} + +} // namespace regor diff --git a/ethosu/regor/architecture/ethosu65/ethos_u65.hpp b/ethosu/regor/architecture/ethosu65/ethos_u65.hpp new file mode 
100644 index 00000000..057f5d5b --- /dev/null +++ b/ethosu/regor/architecture/ethosu65/ethos_u65.hpp @@ -0,0 +1,45 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2023 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "architecture/ethosu55/ethos_u55.hpp" +#include "common/shape.hpp" + +#include + +namespace regor +{ + +/// +/// EthosU65 specialisation (based on U55) +/// +class ArchEthosU65 : public ArchEthosU55 +{ +public: + ArchEthosU65(); + + bool ParseConfig(IniReader *reader) override; + Address MaxAddress() override { return 1LL << 40; } + std::vector ConfigRegisters() override; + +private: + int MaxOutstandingDMAOps() override { return 2; } +}; + +} // namespace regor diff --git a/ethosu/regor/architecture/ethosu65/ethos_u65_interface.hpp b/ethosu/regor/architecture/ethosu65/ethos_u65_interface.hpp new file mode 100644 index 00000000..968d4015 --- /dev/null +++ b/ethosu/regor/architecture/ethosu65/ethos_u65_interface.hpp @@ -0,0 +1,21699 @@ +// +// SPDX-FileCopyrightText: Copyright 2021, 2023 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#ifdef __KERNEL__ +#include +#else +#include +#endif + +#if !defined(__cplusplus) || __cplusplus < 201402L +#define CONSTEXPR +#else +#define CONSTEXPR constexpr +#endif + +#ifndef __cplusplus +#define STRUCT struct +#else +#define STRUCT +#endif + +#if defined(__cplusplus) && defined(NPU_DISASSEMBLE) +#include +#include +#include +#endif + +#if defined(__cplusplus) && !defined(NPU_NAMESPACE) +#define NPU_NAMESPACE npu +#endif + +#ifdef __cplusplus +#include +#include +#include +#endif + +#ifdef __cplusplus +namespace NPU_NAMESPACE +{ +#endif +#define NNX_ARCH_VERSION_MAJOR 1 +#define NNX_ARCH_VERSION_MINOR 0 +#define NNX_ARCH_VERSION_PATCH 6 + + + + + +#define NPU_REG_ID 0x0000 +#define NPU_REG_STATUS 0x0004 +#define NPU_REG_CMD 0x0008 +#define NPU_REG_RESET 0x000C +#define NPU_REG_QBASE 0x0010 +#define NPU_REG_QBASE_HI 0x0014 +#define NPU_REG_QREAD 0x0018 +#define NPU_REG_QCONFIG 0x001C +#define NPU_REG_QSIZE 0x0020 +#define NPU_REG_PROT 0x0024 +#define NPU_REG_CONFIG 0x0028 +#define NPU_REG_LOCK 0x002C +#define NPU_REG_REGIONCFG 0x003C +#define NPU_REG_AXI_LIMIT0 0x0040 +#define NPU_REG_AXI_LIMIT1 0x0044 +#define NPU_REG_AXI_LIMIT2 0x0048 +#define NPU_REG_AXI_LIMIT3 0x004C +#define BASE_REGISTERS_SIZE 0x0080 + + + + +#define NPU_REG_BASEP_BASE 0x0080 +#define NPU_REG_BASEP_ARRLEN 0x0008 +#define BASE_POINTERS_REGISTERS_SIZE 0x0100 + + + + +#define NPU_REG_WD_STATUS 0x0100 +#define NPU_REG_MAC_STATUS 0x0104 +#define NPU_REG_AO_STATUS 0x0108 +#define NPU_REG_DMA_STATUS0 0x0110 +#define NPU_REG_DMA_STATUS1 0x0114 +#define 
NPU_REG_CLKFORCE 0x0140 +#define NPU_REG_DEBUG_ADDRESS 0x0144 +#define NPU_REG_DEBUG_MISC 0x0148 +#define NPU_REG_DEBUGCORE 0x014C +#define NPU_REG_DEBUG_BLOCK 0x0150 +#define DEBUG_REGISTERS_SIZE 0x0180 + + + + +#define NPU_REG_PMCR 0x0180 +#define NPU_REG_PMCNTENSET 0x0184 +#define NPU_REG_PMCNTENCLR 0x0188 +#define NPU_REG_PMOVSSET 0x018C +#define NPU_REG_PMOVSCLR 0x0190 +#define NPU_REG_PMINTSET 0x0194 +#define NPU_REG_PMINTCLR 0x0198 +#define NPU_REG_PMCCNTR 0x01A0 +#define NPU_REG_PMCCNTR_HI 0x01A4 +#define NPU_REG_PMCCNTR_CFG 0x01A8 +#define NPU_REG_PMCAXI_CHAN 0x01AC +#define PMU_REGISTERS_SIZE 0x0200 + + + + +#define NPU_REG_KERNEL_X 0x0200 +#define NPU_REG_KERNEL_Y 0x0204 +#define NPU_REG_KERNEL_W_M1 0x0208 +#define NPU_REG_KERNEL_H_M1 0x020C +#define NPU_REG_OFM_CBLK_WIDTH_M1 0x0210 +#define NPU_REG_OFM_CBLK_HEIGHT_M1 0x0214 +#define NPU_REG_OFM_CBLK_DEPTH_M1 0x0218 +#define NPU_REG_IFM_CBLK_DEPTH_M1 0x021C +#define NPU_REG_OFM_X 0x0220 +#define NPU_REG_OFM_Y 0x0224 +#define NPU_REG_OFM_Z 0x0228 +#define NPU_REG_IFM_Z 0x022C +#define NPU_REG_PAD_TOP 0x0230 +#define NPU_REG_PAD_LEFT 0x0234 +#define NPU_REG_IFM_CBLK_WIDTH 0x0238 +#define NPU_REG_IFM_CBLK_HEIGHT 0x023C +#define NPU_REG_DMA_IFM_SRC 0x0240 +#define NPU_REG_DMA_IFM_SRC_HI 0x0244 +#define NPU_REG_DMA_IFM_DST 0x0248 +#define NPU_REG_DMA_OFM_SRC 0x024C +#define NPU_REG_DMA_OFM_DST 0x0250 +#define NPU_REG_DMA_OFM_DST_HI 0x0254 +#define NPU_REG_DMA_WEIGHT_SRC 0x0258 +#define NPU_REG_DMA_WEIGHT_SRC_HI 0x025C +#define NPU_REG_DMA_CMD_SRC 0x0260 +#define NPU_REG_DMA_CMD_SRC_HI 0x0264 +#define NPU_REG_DMA_CMD_SIZE 0x0268 +#define NPU_REG_DMA_M2M_SRC 0x026C +#define NPU_REG_DMA_M2M_SRC_HI 0x0270 +#define NPU_REG_DMA_M2M_DST 0x0274 +#define NPU_REG_DMA_M2M_DST_HI 0x0278 +#define NPU_REG_CURRENT_QREAD 0x027C +#define NPU_REG_DMA_SCALE_SRC 0x0280 +#define NPU_REG_DMA_SCALE_SRC_HI 0x0284 +#define NPU_REG_CURRENT_BLOCK 0x02B4 +#define NPU_REG_CURRENT_OP 0x02B8 +#define NPU_REG_CURRENT_CMD 0x02BC +#define 
TSU_DEBUG_REGISTERS_SIZE 0x0300 + + + + +#define NPU_REG_PMEVCNTR_BASE 0x0300 +#define NPU_REG_PMEVCNTR_ARRLEN 0x0004 +#define NPU_REG_PMEVTYPER_BASE 0x0380 +#define NPU_REG_PMEVTYPER_ARRLEN 0x0004 +#define PMU_COUNTERS_REGISTERS_SIZE 0x0400 + + + + +#define NPU_REG_SHARED_BUFFER_BASE 0x0400 +#define NPU_REG_SHARED_BUFFER_ARRLEN 0x0100 +#define SHARED_BUFFER_REGISTERS_SIZE 0x0800 + + + + +#define NPU_REG_IFM_PAD_TOP 0x0800 +#define NPU_REG_IFM_PAD_LEFT 0x0804 +#define NPU_REG_IFM_PAD_RIGHT 0x0808 +#define NPU_REG_IFM_PAD_BOTTOM 0x080C +#define NPU_REG_IFM_DEPTH_M1 0x0810 +#define NPU_REG_IFM_PRECISION 0x0814 +#define NPU_REG_IFM_UPSCALE 0x081C +#define NPU_REG_IFM_ZERO_POINT 0x0824 +#define NPU_REG_IFM_WIDTH0_M1 0x0828 +#define NPU_REG_IFM_HEIGHT0_M1 0x082C +#define NPU_REG_IFM_HEIGHT1_M1 0x0830 +#define NPU_REG_IFM_IB_END 0x0834 +#define NPU_REG_IFM_REGION 0x083C +#define TSU_IFM_REGISTERS_SIZE 0x0840 + + + + +#define NPU_REG_OFM_WIDTH_M1 0x0844 +#define NPU_REG_OFM_HEIGHT_M1 0x0848 +#define NPU_REG_OFM_DEPTH_M1 0x084C +#define NPU_REG_OFM_PRECISION 0x0850 +#define NPU_REG_OFM_BLK_WIDTH_M1 0x0854 +#define NPU_REG_OFM_BLK_HEIGHT_M1 0x0858 +#define NPU_REG_OFM_BLK_DEPTH_M1 0x085C +#define NPU_REG_OFM_ZERO_POINT 0x0860 +#define NPU_REG_OFM_WIDTH0_M1 0x0868 +#define NPU_REG_OFM_HEIGHT0_M1 0x086C +#define NPU_REG_OFM_HEIGHT1_M1 0x0870 +#define NPU_REG_OFM_REGION 0x087C +#define TSU_OFM_REGISTERS_SIZE 0x0880 + + + + +#define NPU_REG_KERNEL_WIDTH_M1 0x0880 +#define NPU_REG_KERNEL_HEIGHT_M1 0x0884 +#define NPU_REG_KERNEL_STRIDE 0x0888 +#define NPU_REG_PARALLEL_MODE 0x088C +#define NPU_REG_ACC_FORMAT 0x0890 +#define NPU_REG_ACTIVATION 0x0894 +#define NPU_REG_ACTIVATION_MIN 0x0898 +#define NPU_REG_ACTIVATION_MAX 0x089C +#define NPU_REG_WEIGHT_REGION 0x08A0 +#define NPU_REG_SCALE_REGION 0x08A4 +#define NPU_REG_AB_START 0x08B4 +#define NPU_REG_BLOCKDEP 0x08BC +#define TSU_KERNEL_REGISTERS_SIZE 0x08C0 + + + + +#define NPU_REG_DMA0_SRC_REGION 0x08C0 +#define 
NPU_REG_DMA0_DST_REGION 0x08C4 +#define NPU_REG_DMA0_SIZE0 0x08C8 +#define NPU_REG_DMA0_SIZE1 0x08CC +#define TSU_DMA_REGISTERS_SIZE 0x0900 + + + + +#define NPU_REG_IFM2_BROADCAST 0x0900 +#define NPU_REG_IFM2_SCALAR 0x0904 +#define NPU_REG_IFM2_PRECISION 0x0914 +#define NPU_REG_IFM2_ZERO_POINT 0x0924 +#define NPU_REG_IFM2_WIDTH0_M1 0x0928 +#define NPU_REG_IFM2_HEIGHT0_M1 0x092C +#define NPU_REG_IFM2_HEIGHT1_M1 0x0930 +#define NPU_REG_IFM2_IB_START 0x0934 +#define NPU_REG_IFM2_REGION 0x093C +#define TSU_IFM2_REGISTERS_SIZE 0x0940 + + + + +#define NPU_REG_IFM_BASE0 0x0A00 +#define NPU_REG_IFM_BASE0_HI 0x0A04 +#define NPU_REG_IFM_BASE1 0x0A08 +#define NPU_REG_IFM_BASE1_HI 0x0A0C +#define NPU_REG_IFM_BASE2 0x0A10 +#define NPU_REG_IFM_BASE2_HI 0x0A14 +#define NPU_REG_IFM_BASE3 0x0A18 +#define NPU_REG_IFM_BASE3_HI 0x0A1C +#define NPU_REG_IFM_STRIDE_X 0x0A20 +#define NPU_REG_IFM_STRIDE_X_HI 0x0A24 +#define NPU_REG_IFM_STRIDE_Y 0x0A28 +#define NPU_REG_IFM_STRIDE_Y_HI 0x0A2C +#define NPU_REG_IFM_STRIDE_C 0x0A30 +#define NPU_REG_IFM_STRIDE_C_HI 0x0A34 +#define TSU_IFM_BASE_REGISTERS_SIZE 0x0A40 + + + + +#define NPU_REG_OFM_BASE0 0x0A40 +#define NPU_REG_OFM_BASE0_HI 0x0A44 +#define NPU_REG_OFM_BASE1 0x0A48 +#define NPU_REG_OFM_BASE1_HI 0x0A4C +#define NPU_REG_OFM_BASE2 0x0A50 +#define NPU_REG_OFM_BASE2_HI 0x0A54 +#define NPU_REG_OFM_BASE3 0x0A58 +#define NPU_REG_OFM_BASE3_HI 0x0A5C +#define NPU_REG_OFM_STRIDE_X 0x0A60 +#define NPU_REG_OFM_STRIDE_X_HI 0x0A64 +#define NPU_REG_OFM_STRIDE_Y 0x0A68 +#define NPU_REG_OFM_STRIDE_Y_HI 0x0A6C +#define NPU_REG_OFM_STRIDE_C 0x0A70 +#define NPU_REG_OFM_STRIDE_C_HI 0x0A74 +#define TSU_OFM_BASE_REGISTERS_SIZE 0x0A80 + + + + +#define NPU_REG_WEIGHT_BASE 0x0A80 +#define NPU_REG_WEIGHT_BASE_HI 0x0A84 +#define NPU_REG_WEIGHT_LENGTH 0x0A88 +#define NPU_REG_WEIGHT_LENGTH_HI 0x0A8C +#define NPU_REG_SCALE_BASE 0x0A90 +#define NPU_REG_SCALE_BASE_HI 0x0A94 +#define NPU_REG_SCALE_LENGTH 0x0A98 +#define NPU_REG_SCALE_LENGTH_HI 0x0A9C +#define 
NPU_REG_OFM_SCALE 0x0AA0 +#define NPU_REG_OFM_SCALE_HI 0x0AA4 +#define NPU_REG_OPA_SCALE 0x0AA8 +#define NPU_REG_OPA_SCALE_HI 0x0AAC +#define NPU_REG_OPB_SCALE 0x0AB0 +#define TSU_WS_BASE_REGISTERS_SIZE 0x0AC0 + + + + +#define NPU_REG_DMA0_SRC 0x0AC0 +#define NPU_REG_DMA0_SRC_HI 0x0AC4 +#define NPU_REG_DMA0_DST 0x0AC8 +#define NPU_REG_DMA0_DST_HI 0x0ACC +#define NPU_REG_DMA0_LEN 0x0AD0 +#define NPU_REG_DMA0_LEN_HI 0x0AD4 +#define NPU_REG_DMA0_SKIP0 0x0AD8 +#define NPU_REG_DMA0_SKIP0_HI 0x0ADC +#define NPU_REG_DMA0_SKIP1 0x0AE0 +#define NPU_REG_DMA0_SKIP1_HI 0x0AE4 +#define TSU_DMA_BASE_REGISTERS_SIZE 0x0B00 + + + + +#define NPU_REG_IFM2_BASE0 0x0B00 +#define NPU_REG_IFM2_BASE0_HI 0x0B04 +#define NPU_REG_IFM2_BASE1 0x0B08 +#define NPU_REG_IFM2_BASE1_HI 0x0B0C +#define NPU_REG_IFM2_BASE2 0x0B10 +#define NPU_REG_IFM2_BASE2_HI 0x0B14 +#define NPU_REG_IFM2_BASE3 0x0B18 +#define NPU_REG_IFM2_BASE3_HI 0x0B1C +#define NPU_REG_IFM2_STRIDE_X 0x0B20 +#define NPU_REG_IFM2_STRIDE_X_HI 0x0B24 +#define NPU_REG_IFM2_STRIDE_Y 0x0B28 +#define NPU_REG_IFM2_STRIDE_Y_HI 0x0B2C +#define NPU_REG_IFM2_STRIDE_C 0x0B30 +#define NPU_REG_IFM2_STRIDE_C_HI 0x0B34 +#define TSU_IFM2_BASE_REGISTERS_SIZE 0x0B40 + + + + +#define NPU_REG_WEIGHT1_BASE 0x0B40 +#define NPU_REG_WEIGHT1_BASE_HI 0x0B44 +#define NPU_REG_WEIGHT1_LENGTH 0x0B48 +#define NPU_REG_WEIGHT1_LENGTH_HI 0x0B4C +#define NPU_REG_SCALE1_BASE 0x0B50 +#define NPU_REG_SCALE1_BASE_HI 0x0B54 +#define NPU_REG_SCALE1_LENGTH 0x0B58 +#define NPU_REG_SCALE1_LENGTH_HI 0x0B5C +#define TSU_WS1_BASE_REGISTERS_SIZE 0x0B80 + + + + +#define TSU_USER_BASE_REGISTERS_SIZE 0x0BC0 + + + + +#define TSU_DMA_EBASE_REGISTERS_SIZE 0x0C00 + + + + +#define NPU_REG_REVISION 0x0FC0 +#define NPU_REG_PID4 0x0FD0 +#define NPU_REG_PID5 0x0FD4 +#define NPU_REG_PID6 0x0FD8 +#define NPU_REG_PID7 0x0FDC +#define NPU_REG_PID0 0x0FE0 +#define NPU_REG_PID1 0x0FE4 +#define NPU_REG_PID2 0x0FE8 +#define NPU_REG_PID3 0x0FEC +#define NPU_REG_CID0 0x0FF0 +#define NPU_REG_CID1 0x0FF4 
+#define NPU_REG_CID2 0x0FF8 +#define NPU_REG_CID3 0x0FFC +#define ID_REGISTERS_SIZE 0x1000 + +#ifdef __cplusplus + +enum class acc_format : uint8_t +{ + I32 = 0, + I40 = 1, + F16 = 2, +}; + +enum class activation_clip_range : uint8_t +{ + OFM_PRECISION = 0, + FORCE_UINT8 = 2, + FORCE_INT8 = 3, + FORCE_INT16 = 5, +}; + +enum class activation_format : uint8_t +{ + NHWC = 0, + NHCWB16 = 1, +}; + +enum class activation_function : uint8_t +{ + RELU = 0, + TANH = 3, + SIGMOID = 4, + TABLE_0 = 16, + TABLE_1 = 17, + TABLE_2 = 18, + TABLE_3 = 19, + TABLE_4 = 20, + TABLE_5 = 21, + TABLE_6 = 22, + TABLE_7 = 23, +}; + +enum class activation_precision : uint8_t +{ + B8 = 0, + B16 = 1, + B32 = 2, + B64 = 3, +}; + +enum class activation_type : uint8_t +{ + UNSIGNED = 0, + SIGNED = 1, +}; + +enum class axi_mem_encoding : uint8_t +{ + DEVICE_NON_BUFFERABLE = 0, + DEVICE_BUFFERABLE = 1, + NORMAL_NON_CACHEABLE_NON_BUFFERABLE = 2, + NORMAL_NON_CACHEABLE_BUFFERABLE = 3, + WRITE_THROUGH_NO_ALLOCATE = 4, + WRITE_THROUGH_READ_ALLOCATE = 5, + WRITE_THROUGH_WRITE_ALLOCATE = 6, + WRITE_THROUGH_READ_AND_WRITE_ALLOCATE = 7, + WRITE_BACK_NO_ALLOCATE = 8, + WRITE_BACK_READ_ALLOCATE = 9, + WRITE_BACK_WRITE_ALLOCATE = 10, + WRITE_BACK_READ_AND_WRITE_ALLOCATE = 11, +}; + +enum class broadcast_mode : uint8_t +{ + DISABLE = 0, + ENABLE = 1, +}; + +enum class cmd0_opcode : uint16_t +{ + NPU_OP_STOP = 0, + NPU_OP_IRQ = 1, + NPU_OP_CONV = 2, + NPU_OP_DEPTHWISE = 3, + NPU_OP_POOL = 5, + NPU_OP_ELEMENTWISE = 6, + NPU_OP_DMA_START = 16, + NPU_OP_DMA_WAIT = 17, + NPU_OP_KERNEL_WAIT = 18, + NPU_OP_PMU_MASK = 19, + NPU_SET_IFM_PAD_TOP = 256, + NPU_SET_IFM_PAD_LEFT = 257, + NPU_SET_IFM_PAD_RIGHT = 258, + NPU_SET_IFM_PAD_BOTTOM = 259, + NPU_SET_IFM_DEPTH_M1 = 260, + NPU_SET_IFM_PRECISION = 261, + NPU_SET_IFM_UPSCALE = 263, + NPU_SET_IFM_ZERO_POINT = 265, + NPU_SET_IFM_WIDTH0_M1 = 266, + NPU_SET_IFM_HEIGHT0_M1 = 267, + NPU_SET_IFM_HEIGHT1_M1 = 268, + NPU_SET_IFM_IB_END = 269, + NPU_SET_IFM_REGION = 271, + 
NPU_SET_OFM_WIDTH_M1 = 273, + NPU_SET_OFM_HEIGHT_M1 = 274, + NPU_SET_OFM_DEPTH_M1 = 275, + NPU_SET_OFM_PRECISION = 276, + NPU_SET_OFM_BLK_WIDTH_M1 = 277, + NPU_SET_OFM_BLK_HEIGHT_M1 = 278, + NPU_SET_OFM_BLK_DEPTH_M1 = 279, + NPU_SET_OFM_ZERO_POINT = 280, + NPU_SET_OFM_WIDTH0_M1 = 282, + NPU_SET_OFM_HEIGHT0_M1 = 283, + NPU_SET_OFM_HEIGHT1_M1 = 284, + NPU_SET_OFM_REGION = 287, + NPU_SET_KERNEL_WIDTH_M1 = 288, + NPU_SET_KERNEL_HEIGHT_M1 = 289, + NPU_SET_KERNEL_STRIDE = 290, + NPU_SET_PARALLEL_MODE = 291, + NPU_SET_ACC_FORMAT = 292, + NPU_SET_ACTIVATION = 293, + NPU_SET_ACTIVATION_MIN = 294, + NPU_SET_ACTIVATION_MAX = 295, + NPU_SET_WEIGHT_REGION = 296, + NPU_SET_SCALE_REGION = 297, + NPU_SET_AB_START = 301, + NPU_SET_BLOCKDEP = 303, + NPU_SET_DMA0_SRC_REGION = 304, + NPU_SET_DMA0_DST_REGION = 305, + NPU_SET_DMA0_SIZE0 = 306, + NPU_SET_DMA0_SIZE1 = 307, + NPU_SET_IFM2_BROADCAST = 384, + NPU_SET_IFM2_SCALAR = 385, + NPU_SET_IFM2_PRECISION = 389, + NPU_SET_IFM2_ZERO_POINT = 393, + NPU_SET_IFM2_WIDTH0_M1 = 394, + NPU_SET_IFM2_HEIGHT0_M1 = 395, + NPU_SET_IFM2_HEIGHT1_M1 = 396, + NPU_SET_IFM2_IB_START = 397, + NPU_SET_IFM2_REGION = 399, +}; + +enum class cmd1_opcode : uint16_t +{ + NPU_SET_IFM_BASE0 = 0, + NPU_SET_IFM_BASE1 = 1, + NPU_SET_IFM_BASE2 = 2, + NPU_SET_IFM_BASE3 = 3, + NPU_SET_IFM_STRIDE_X = 4, + NPU_SET_IFM_STRIDE_Y = 5, + NPU_SET_IFM_STRIDE_C = 6, + NPU_SET_OFM_BASE0 = 16, + NPU_SET_OFM_BASE1 = 17, + NPU_SET_OFM_BASE2 = 18, + NPU_SET_OFM_BASE3 = 19, + NPU_SET_OFM_STRIDE_X = 20, + NPU_SET_OFM_STRIDE_Y = 21, + NPU_SET_OFM_STRIDE_C = 22, + NPU_SET_WEIGHT_BASE = 32, + NPU_SET_WEIGHT_LENGTH = 33, + NPU_SET_SCALE_BASE = 34, + NPU_SET_SCALE_LENGTH = 35, + NPU_SET_OFM_SCALE = 36, + NPU_SET_OPA_SCALE = 37, + NPU_SET_OPB_SCALE = 38, + NPU_SET_DMA0_SRC = 48, + NPU_SET_DMA0_DST = 49, + NPU_SET_DMA0_LEN = 50, + NPU_SET_DMA0_SKIP0 = 51, + NPU_SET_DMA0_SKIP1 = 52, + NPU_SET_IFM2_BASE0 = 128, + NPU_SET_IFM2_BASE1 = 129, + NPU_SET_IFM2_BASE2 = 130, + NPU_SET_IFM2_BASE3 = 131, + 
NPU_SET_IFM2_STRIDE_X = 132, + NPU_SET_IFM2_STRIDE_Y = 133, + NPU_SET_IFM2_STRIDE_C = 134, + NPU_SET_WEIGHT1_BASE = 144, + NPU_SET_WEIGHT1_LENGTH = 145, + NPU_SET_SCALE1_BASE = 146, + NPU_SET_SCALE1_LENGTH = 147, +}; + +enum class cmd_ctrl : uint8_t +{ + CMD0_CTRL = 0, + CMD1_CTRL = 1, +}; + +enum class custom_dma : uint8_t +{ + NOT_IMPLEMENTED = 0, + IMPLEMENTED = 1, +}; + +enum class dma_fault_channel : uint8_t +{ + CMD_READ = 0, + IFM_READ = 1, + WEIGHT_READ = 2, + SBS_READ = 3, + MEM2MEM_READ = 4, + OFM_WRITE = 8, + MEM2MEM_WRITE = 9, +}; + +enum class dma_fault_src : uint8_t +{ + AXI_M0 = 0, + AXI_M1 = 1, +}; + +enum class dma_region_mode : uint8_t +{ + EXTERNAL = 0, + INTERNAL = 1, +}; + +enum class dma_stride_mode : uint8_t +{ + D1 = 0, + D2 = 1, + D3 = 2, +}; + +enum class elementwise_mode : uint8_t +{ + MUL = 0, + ADD = 1, + SUB = 2, + MIN = 3, + MAX = 4, + LRELU = 5, + ABS = 6, + CLZ = 7, + SHR = 8, + SHL = 9, +}; + +enum class ifm2_operand_order : uint8_t +{ + ORDER_B = 0, + ORDER_A = 1, +}; + +enum class ifm_scale_mode : uint8_t +{ + OPA_OPB_16 = 0, + OPA_32 = 1, + OPB_32 = 2, +}; + +enum class ifm_upscale_mode : uint8_t +{ + NONE = 0, + NEAREST = 1, + ZEROS = 2, +}; + +enum class kernel_decomposition : uint8_t +{ + D8X8 = 0, + D4X4 = 1, +}; + +enum class kernel_dilation : uint8_t +{ + NONE = 0, + X2 = 1, +}; + +enum class max_beats : uint8_t +{ + B64 = 0, + B128 = 1, + B256 = 2, +}; + +enum class mem_attr : uint8_t +{ + AXI0_OUTSTANDING_COUNTER0 = 0, + AXI0_OUTSTANDING_COUNTER1 = 1, + AXI1_OUTSTANDING_COUNTER2 = 2, + AXI1_OUTSTANDING_COUNTER3 = 3, +}; + +enum class ofm_scale_mode : uint8_t +{ + PER_CHANNEL = 0, + GLOBAL = 1, +}; + +enum class parallel_mode : uint8_t +{ + SINGLE_CORE = 0, + DUAL_CORE_DEPTH = 1, +}; + +enum class pmu_axi_channel : uint8_t +{ + RD_CMD = 0, + RD_IFM = 1, + RD_WEIGHTS = 2, + RD_SCALE_BIAS = 3, + RD_MEM2MEM = 4, + WR_OFM = 8, + WR_MEM2MEM = 9, +}; + +enum class pmu_event : uint16_t +{ + NO_EVENT = 0, + CYCLE = 17, + NPU_IDLE 
= 32, + CC_STALLED_ON_BLOCKDEP = 33, + CC_STALLED_ON_SHRAM_RECONFIG = 34, + NPU_ACTIVE = 35, + MAC_ACTIVE = 48, + MAC_ACTIVE_8BIT = 49, + MAC_ACTIVE_16BIT = 50, + MAC_DPU_ACTIVE = 51, + MAC_STALLED_BY_WD_ACC = 52, + MAC_STALLED_BY_WD = 53, + MAC_STALLED_BY_ACC = 54, + MAC_STALLED_BY_IB = 55, + MAC_ACTIVE_32BIT = 56, + MAC_STALLED_BY_INT_W = 57, + MAC_STALLED_BY_INT_ACC = 58, + AO_ACTIVE = 64, + AO_ACTIVE_8BIT = 65, + AO_ACTIVE_16BIT = 66, + AO_STALLED_BY_OFMP_OB = 67, + AO_STALLED_BY_OFMP = 68, + AO_STALLED_BY_OB = 69, + AO_STALLED_BY_ACC_IB = 70, + AO_STALLED_BY_ACC = 71, + AO_STALLED_BY_IB = 72, + WD_ACTIVE = 80, + WD_STALLED = 81, + WD_STALLED_BY_WS = 82, + WD_STALLED_BY_WD_BUF = 83, + WD_PARSE_ACTIVE = 84, + WD_PARSE_STALLED = 85, + WD_PARSE_STALLED_IN = 86, + WD_PARSE_STALLED_OUT = 87, + WD_TRANS_WS = 88, + WD_TRANS_WB = 89, + WD_TRANS_DW0 = 90, + WD_TRANS_DW1 = 91, + AXI0_RD_TRANS_ACCEPTED = 128, + AXI0_RD_TRANS_COMPLETED = 129, + AXI0_RD_DATA_BEAT_RECEIVED = 130, + AXI0_RD_TRAN_REQ_STALLED = 131, + AXI0_WR_TRANS_ACCEPTED = 132, + AXI0_WR_TRANS_COMPLETED_M = 133, + AXI0_WR_TRANS_COMPLETED_S = 134, + AXI0_WR_DATA_BEAT_WRITTEN = 135, + AXI0_WR_TRAN_REQ_STALLED = 136, + AXI0_WR_DATA_BEAT_STALLED = 137, + AXI0_ENABLED_CYCLES = 140, + AXI0_RD_STALL_LIMIT = 142, + AXI0_WR_STALL_LIMIT = 143, + AXI_LATENCY_ANY = 160, + AXI_LATENCY_32 = 161, + AXI_LATENCY_64 = 162, + AXI_LATENCY_128 = 163, + AXI_LATENCY_256 = 164, + AXI_LATENCY_512 = 165, + AXI_LATENCY_1024 = 166, + ECC_DMA = 176, + ECC_SB0 = 177, + AXI1_RD_TRANS_ACCEPTED = 384, + AXI1_RD_TRANS_COMPLETED = 385, + AXI1_RD_DATA_BEAT_RECEIVED = 386, + AXI1_RD_TRAN_REQ_STALLED = 387, + AXI1_WR_TRANS_ACCEPTED = 388, + AXI1_WR_TRANS_COMPLETED_M = 389, + AXI1_WR_TRANS_COMPLETED_S = 390, + AXI1_WR_DATA_BEAT_WRITTEN = 391, + AXI1_WR_TRAN_REQ_STALLED = 392, + AXI1_WR_DATA_BEAT_STALLED = 393, + AXI1_ENABLED_CYCLES = 396, + AXI1_RD_STALL_LIMIT = 398, + AXI1_WR_STALL_LIMIT = 399, + ECC_SB1 = 433, +}; + +enum class pooling_mode : 
uint8_t +{ + MAX = 0, + AVERAGE = 1, + REDUCE_SUM = 2, +}; + +enum class privilege_level : uint8_t +{ + USER = 0, + PRIVILEGED = 1, +}; + +enum class round_mode : uint8_t +{ + DBL = 0, + TRUNCATE = 1, + NATURAL = 2, +}; + +enum class security_level : uint8_t +{ + SECURE = 0, + NON_SECURE = 1, +}; + +enum class state : uint8_t +{ + STOPPED = 0, + RUNNING = 1, +}; + +enum class wd_core_slice_state : uint8_t +{ + HEADER = 0, + PALETTE = 1, + WEIGHTS = 2, +}; + +enum class wd_ctrl_state : uint8_t +{ + IDLE = 0, + DRAIN = 1, + OFD_INIT = 2, + OFD_RUN = 3, +}; + +enum class weight_order : uint8_t +{ + DEPTH_FIRST = 0, + PART_KERNEL_FIRST = 1, +}; + +#else + +enum acc_format +{ + ACC_FORMAT_I32 = 0, + ACC_FORMAT_I40 = 1, + ACC_FORMAT_F16 = 2, +}; + +enum activation_clip_range +{ + ACTIVATION_CLIP_RANGE_OFM_PRECISION = 0, + ACTIVATION_CLIP_RANGE_FORCE_UINT8 = 2, + ACTIVATION_CLIP_RANGE_FORCE_INT8 = 3, + ACTIVATION_CLIP_RANGE_FORCE_INT16 = 5, +}; + +enum activation_format +{ + ACTIVATION_FORMAT_NHWC = 0, + ACTIVATION_FORMAT_NHCWB16 = 1, +}; + +enum activation_function +{ + ACTIVATION_FUNCTION_RELU = 0, + ACTIVATION_FUNCTION_TANH = 3, + ACTIVATION_FUNCTION_SIGMOID = 4, + ACTIVATION_FUNCTION_TABLE_0 = 16, + ACTIVATION_FUNCTION_TABLE_1 = 17, + ACTIVATION_FUNCTION_TABLE_2 = 18, + ACTIVATION_FUNCTION_TABLE_3 = 19, + ACTIVATION_FUNCTION_TABLE_4 = 20, + ACTIVATION_FUNCTION_TABLE_5 = 21, + ACTIVATION_FUNCTION_TABLE_6 = 22, + ACTIVATION_FUNCTION_TABLE_7 = 23, +}; + +enum activation_precision +{ + ACTIVATION_PRECISION_B8 = 0, + ACTIVATION_PRECISION_B16 = 1, + ACTIVATION_PRECISION_B32 = 2, + ACTIVATION_PRECISION_B64 = 3, +}; + +enum activation_type +{ + ACTIVATION_TYPE_UNSIGNED = 0, + ACTIVATION_TYPE_SIGNED = 1, +}; + +enum axi_mem_encoding +{ + AXI_MEM_ENCODING_DEVICE_NON_BUFFERABLE = 0, + AXI_MEM_ENCODING_DEVICE_BUFFERABLE = 1, + AXI_MEM_ENCODING_NORMAL_NON_CACHEABLE_NON_BUFFERABLE = 2, + AXI_MEM_ENCODING_NORMAL_NON_CACHEABLE_BUFFERABLE = 3, + 
AXI_MEM_ENCODING_WRITE_THROUGH_NO_ALLOCATE = 4, + AXI_MEM_ENCODING_WRITE_THROUGH_READ_ALLOCATE = 5, + AXI_MEM_ENCODING_WRITE_THROUGH_WRITE_ALLOCATE = 6, + AXI_MEM_ENCODING_WRITE_THROUGH_READ_AND_WRITE_ALLOCATE = 7, + AXI_MEM_ENCODING_WRITE_BACK_NO_ALLOCATE = 8, + AXI_MEM_ENCODING_WRITE_BACK_READ_ALLOCATE = 9, + AXI_MEM_ENCODING_WRITE_BACK_WRITE_ALLOCATE = 10, + AXI_MEM_ENCODING_WRITE_BACK_READ_AND_WRITE_ALLOCATE = 11, +}; + +enum broadcast_mode +{ + BROADCAST_MODE_DISABLE = 0, + BROADCAST_MODE_ENABLE = 1, +}; + +enum cmd0_opcode +{ + CMD0_OPCODE_NPU_OP_STOP = 0, + CMD0_OPCODE_NPU_OP_IRQ = 1, + CMD0_OPCODE_NPU_OP_CONV = 2, + CMD0_OPCODE_NPU_OP_DEPTHWISE = 3, + CMD0_OPCODE_NPU_OP_POOL = 5, + CMD0_OPCODE_NPU_OP_ELEMENTWISE = 6, + CMD0_OPCODE_NPU_OP_DMA_START = 16, + CMD0_OPCODE_NPU_OP_DMA_WAIT = 17, + CMD0_OPCODE_NPU_OP_KERNEL_WAIT = 18, + CMD0_OPCODE_NPU_OP_PMU_MASK = 19, + CMD0_OPCODE_NPU_SET_IFM_PAD_TOP = 256, + CMD0_OPCODE_NPU_SET_IFM_PAD_LEFT = 257, + CMD0_OPCODE_NPU_SET_IFM_PAD_RIGHT = 258, + CMD0_OPCODE_NPU_SET_IFM_PAD_BOTTOM = 259, + CMD0_OPCODE_NPU_SET_IFM_DEPTH_M1 = 260, + CMD0_OPCODE_NPU_SET_IFM_PRECISION = 261, + CMD0_OPCODE_NPU_SET_IFM_UPSCALE = 263, + CMD0_OPCODE_NPU_SET_IFM_ZERO_POINT = 265, + CMD0_OPCODE_NPU_SET_IFM_WIDTH0_M1 = 266, + CMD0_OPCODE_NPU_SET_IFM_HEIGHT0_M1 = 267, + CMD0_OPCODE_NPU_SET_IFM_HEIGHT1_M1 = 268, + CMD0_OPCODE_NPU_SET_IFM_IB_END = 269, + CMD0_OPCODE_NPU_SET_IFM_REGION = 271, + CMD0_OPCODE_NPU_SET_OFM_WIDTH_M1 = 273, + CMD0_OPCODE_NPU_SET_OFM_HEIGHT_M1 = 274, + CMD0_OPCODE_NPU_SET_OFM_DEPTH_M1 = 275, + CMD0_OPCODE_NPU_SET_OFM_PRECISION = 276, + CMD0_OPCODE_NPU_SET_OFM_BLK_WIDTH_M1 = 277, + CMD0_OPCODE_NPU_SET_OFM_BLK_HEIGHT_M1 = 278, + CMD0_OPCODE_NPU_SET_OFM_BLK_DEPTH_M1 = 279, + CMD0_OPCODE_NPU_SET_OFM_ZERO_POINT = 280, + CMD0_OPCODE_NPU_SET_OFM_WIDTH0_M1 = 282, + CMD0_OPCODE_NPU_SET_OFM_HEIGHT0_M1 = 283, + CMD0_OPCODE_NPU_SET_OFM_HEIGHT1_M1 = 284, + CMD0_OPCODE_NPU_SET_OFM_REGION = 287, + CMD0_OPCODE_NPU_SET_KERNEL_WIDTH_M1 = 
288, + CMD0_OPCODE_NPU_SET_KERNEL_HEIGHT_M1 = 289, + CMD0_OPCODE_NPU_SET_KERNEL_STRIDE = 290, + CMD0_OPCODE_NPU_SET_PARALLEL_MODE = 291, + CMD0_OPCODE_NPU_SET_ACC_FORMAT = 292, + CMD0_OPCODE_NPU_SET_ACTIVATION = 293, + CMD0_OPCODE_NPU_SET_ACTIVATION_MIN = 294, + CMD0_OPCODE_NPU_SET_ACTIVATION_MAX = 295, + CMD0_OPCODE_NPU_SET_WEIGHT_REGION = 296, + CMD0_OPCODE_NPU_SET_SCALE_REGION = 297, + CMD0_OPCODE_NPU_SET_AB_START = 301, + CMD0_OPCODE_NPU_SET_BLOCKDEP = 303, + CMD0_OPCODE_NPU_SET_DMA0_SRC_REGION = 304, + CMD0_OPCODE_NPU_SET_DMA0_DST_REGION = 305, + CMD0_OPCODE_NPU_SET_DMA0_SIZE0 = 306, + CMD0_OPCODE_NPU_SET_DMA0_SIZE1 = 307, + CMD0_OPCODE_NPU_SET_IFM2_BROADCAST = 384, + CMD0_OPCODE_NPU_SET_IFM2_SCALAR = 385, + CMD0_OPCODE_NPU_SET_IFM2_PRECISION = 389, + CMD0_OPCODE_NPU_SET_IFM2_ZERO_POINT = 393, + CMD0_OPCODE_NPU_SET_IFM2_WIDTH0_M1 = 394, + CMD0_OPCODE_NPU_SET_IFM2_HEIGHT0_M1 = 395, + CMD0_OPCODE_NPU_SET_IFM2_HEIGHT1_M1 = 396, + CMD0_OPCODE_NPU_SET_IFM2_IB_START = 397, + CMD0_OPCODE_NPU_SET_IFM2_REGION = 399, +}; + +enum cmd1_opcode +{ + CMD1_OPCODE_NPU_SET_IFM_BASE0 = 0, + CMD1_OPCODE_NPU_SET_IFM_BASE1 = 1, + CMD1_OPCODE_NPU_SET_IFM_BASE2 = 2, + CMD1_OPCODE_NPU_SET_IFM_BASE3 = 3, + CMD1_OPCODE_NPU_SET_IFM_STRIDE_X = 4, + CMD1_OPCODE_NPU_SET_IFM_STRIDE_Y = 5, + CMD1_OPCODE_NPU_SET_IFM_STRIDE_C = 6, + CMD1_OPCODE_NPU_SET_OFM_BASE0 = 16, + CMD1_OPCODE_NPU_SET_OFM_BASE1 = 17, + CMD1_OPCODE_NPU_SET_OFM_BASE2 = 18, + CMD1_OPCODE_NPU_SET_OFM_BASE3 = 19, + CMD1_OPCODE_NPU_SET_OFM_STRIDE_X = 20, + CMD1_OPCODE_NPU_SET_OFM_STRIDE_Y = 21, + CMD1_OPCODE_NPU_SET_OFM_STRIDE_C = 22, + CMD1_OPCODE_NPU_SET_WEIGHT_BASE = 32, + CMD1_OPCODE_NPU_SET_WEIGHT_LENGTH = 33, + CMD1_OPCODE_NPU_SET_SCALE_BASE = 34, + CMD1_OPCODE_NPU_SET_SCALE_LENGTH = 35, + CMD1_OPCODE_NPU_SET_OFM_SCALE = 36, + CMD1_OPCODE_NPU_SET_OPA_SCALE = 37, + CMD1_OPCODE_NPU_SET_OPB_SCALE = 38, + CMD1_OPCODE_NPU_SET_DMA0_SRC = 48, + CMD1_OPCODE_NPU_SET_DMA0_DST = 49, + CMD1_OPCODE_NPU_SET_DMA0_LEN = 50, + 
CMD1_OPCODE_NPU_SET_DMA0_SKIP0 = 51, + CMD1_OPCODE_NPU_SET_DMA0_SKIP1 = 52, + CMD1_OPCODE_NPU_SET_IFM2_BASE0 = 128, + CMD1_OPCODE_NPU_SET_IFM2_BASE1 = 129, + CMD1_OPCODE_NPU_SET_IFM2_BASE2 = 130, + CMD1_OPCODE_NPU_SET_IFM2_BASE3 = 131, + CMD1_OPCODE_NPU_SET_IFM2_STRIDE_X = 132, + CMD1_OPCODE_NPU_SET_IFM2_STRIDE_Y = 133, + CMD1_OPCODE_NPU_SET_IFM2_STRIDE_C = 134, + CMD1_OPCODE_NPU_SET_WEIGHT1_BASE = 144, + CMD1_OPCODE_NPU_SET_WEIGHT1_LENGTH = 145, + CMD1_OPCODE_NPU_SET_SCALE1_BASE = 146, + CMD1_OPCODE_NPU_SET_SCALE1_LENGTH = 147, +}; + +enum cmd_ctrl +{ + CMD_CTRL_CMD0_CTRL = 0, + CMD_CTRL_CMD1_CTRL = 1, +}; + +enum custom_dma +{ + CUSTOM_DMA_NOT_IMPLEMENTED = 0, + CUSTOM_DMA_IMPLEMENTED = 1, +}; + +enum dma_fault_channel +{ + DMA_FAULT_CHANNEL_CMD_READ = 0, + DMA_FAULT_CHANNEL_IFM_READ = 1, + DMA_FAULT_CHANNEL_WEIGHT_READ = 2, + DMA_FAULT_CHANNEL_SBS_READ = 3, + DMA_FAULT_CHANNEL_MEM2MEM_READ = 4, + DMA_FAULT_CHANNEL_OFM_WRITE = 8, + DMA_FAULT_CHANNEL_MEM2MEM_WRITE = 9, +}; + +enum dma_fault_src +{ + DMA_FAULT_SRC_AXI_M0 = 0, + DMA_FAULT_SRC_AXI_M1 = 1, +}; + +enum dma_region_mode +{ + DMA_REGION_MODE_EXTERNAL = 0, + DMA_REGION_MODE_INTERNAL = 1, +}; + +enum dma_stride_mode +{ + DMA_STRIDE_MODE_D1 = 0, + DMA_STRIDE_MODE_D2 = 1, + DMA_STRIDE_MODE_D3 = 2, +}; + +enum elementwise_mode +{ + ELEMENTWISE_MODE_MUL = 0, + ELEMENTWISE_MODE_ADD = 1, + ELEMENTWISE_MODE_SUB = 2, + ELEMENTWISE_MODE_MIN = 3, + ELEMENTWISE_MODE_MAX = 4, + ELEMENTWISE_MODE_LRELU = 5, + ELEMENTWISE_MODE_ABS = 6, + ELEMENTWISE_MODE_CLZ = 7, + ELEMENTWISE_MODE_SHR = 8, + ELEMENTWISE_MODE_SHL = 9, +}; + +enum ifm2_operand_order +{ + IFM2_OPERAND_ORDER_ORDER_B = 0, + IFM2_OPERAND_ORDER_ORDER_A = 1, +}; + +enum ifm_scale_mode +{ + IFM_SCALE_MODE_OPA_OPB_16 = 0, + IFM_SCALE_MODE_OPA_32 = 1, + IFM_SCALE_MODE_OPB_32 = 2, +}; + +enum ifm_upscale_mode +{ + IFM_UPSCALE_MODE_NONE = 0, + IFM_UPSCALE_MODE_NEAREST = 1, + IFM_UPSCALE_MODE_ZEROS = 2, +}; + +enum kernel_decomposition +{ + KERNEL_DECOMPOSITION_D8X8 = 
0, + KERNEL_DECOMPOSITION_D4X4 = 1, +}; + +enum kernel_dilation +{ + KERNEL_DILATION_NONE = 0, + KERNEL_DILATION_X2 = 1, +}; + +enum max_beats +{ + MAX_BEATS_B64 = 0, + MAX_BEATS_B128 = 1, + MAX_BEATS_B256 = 2, +}; + +enum mem_attr +{ + MEM_ATTR_AXI0_OUTSTANDING_COUNTER0 = 0, + MEM_ATTR_AXI0_OUTSTANDING_COUNTER1 = 1, + MEM_ATTR_AXI1_OUTSTANDING_COUNTER2 = 2, + MEM_ATTR_AXI1_OUTSTANDING_COUNTER3 = 3, +}; + +enum ofm_scale_mode +{ + OFM_SCALE_MODE_PER_CHANNEL = 0, + OFM_SCALE_MODE_GLOBAL = 1, +}; + +enum parallel_mode +{ + PARALLEL_MODE_SINGLE_CORE = 0, + PARALLEL_MODE_DUAL_CORE_DEPTH = 1, +}; + +enum pmu_axi_channel +{ + PMU_AXI_CHANNEL_RD_CMD = 0, + PMU_AXI_CHANNEL_RD_IFM = 1, + PMU_AXI_CHANNEL_RD_WEIGHTS = 2, + PMU_AXI_CHANNEL_RD_SCALE_BIAS = 3, + PMU_AXI_CHANNEL_RD_MEM2MEM = 4, + PMU_AXI_CHANNEL_WR_OFM = 8, + PMU_AXI_CHANNEL_WR_MEM2MEM = 9, +}; + +enum pmu_event +{ + PMU_EVENT_NO_EVENT = 0, + PMU_EVENT_CYCLE = 17, + PMU_EVENT_NPU_IDLE = 32, + PMU_EVENT_CC_STALLED_ON_BLOCKDEP = 33, + PMU_EVENT_CC_STALLED_ON_SHRAM_RECONFIG = 34, + PMU_EVENT_NPU_ACTIVE = 35, + PMU_EVENT_MAC_ACTIVE = 48, + PMU_EVENT_MAC_ACTIVE_8BIT = 49, + PMU_EVENT_MAC_ACTIVE_16BIT = 50, + PMU_EVENT_MAC_DPU_ACTIVE = 51, + PMU_EVENT_MAC_STALLED_BY_WD_ACC = 52, + PMU_EVENT_MAC_STALLED_BY_WD = 53, + PMU_EVENT_MAC_STALLED_BY_ACC = 54, + PMU_EVENT_MAC_STALLED_BY_IB = 55, + PMU_EVENT_MAC_ACTIVE_32BIT = 56, + PMU_EVENT_MAC_STALLED_BY_INT_W = 57, + PMU_EVENT_MAC_STALLED_BY_INT_ACC = 58, + PMU_EVENT_AO_ACTIVE = 64, + PMU_EVENT_AO_ACTIVE_8BIT = 65, + PMU_EVENT_AO_ACTIVE_16BIT = 66, + PMU_EVENT_AO_STALLED_BY_OFMP_OB = 67, + PMU_EVENT_AO_STALLED_BY_OFMP = 68, + PMU_EVENT_AO_STALLED_BY_OB = 69, + PMU_EVENT_AO_STALLED_BY_ACC_IB = 70, + PMU_EVENT_AO_STALLED_BY_ACC = 71, + PMU_EVENT_AO_STALLED_BY_IB = 72, + PMU_EVENT_WD_ACTIVE = 80, + PMU_EVENT_WD_STALLED = 81, + PMU_EVENT_WD_STALLED_BY_WS = 82, + PMU_EVENT_WD_STALLED_BY_WD_BUF = 83, + PMU_EVENT_WD_PARSE_ACTIVE = 84, + PMU_EVENT_WD_PARSE_STALLED = 85, + 
PMU_EVENT_WD_PARSE_STALLED_IN = 86, + PMU_EVENT_WD_PARSE_STALLED_OUT = 87, + PMU_EVENT_WD_TRANS_WS = 88, + PMU_EVENT_WD_TRANS_WB = 89, + PMU_EVENT_WD_TRANS_DW0 = 90, + PMU_EVENT_WD_TRANS_DW1 = 91, + PMU_EVENT_AXI0_RD_TRANS_ACCEPTED = 128, + PMU_EVENT_AXI0_RD_TRANS_COMPLETED = 129, + PMU_EVENT_AXI0_RD_DATA_BEAT_RECEIVED = 130, + PMU_EVENT_AXI0_RD_TRAN_REQ_STALLED = 131, + PMU_EVENT_AXI0_WR_TRANS_ACCEPTED = 132, + PMU_EVENT_AXI0_WR_TRANS_COMPLETED_M = 133, + PMU_EVENT_AXI0_WR_TRANS_COMPLETED_S = 134, + PMU_EVENT_AXI0_WR_DATA_BEAT_WRITTEN = 135, + PMU_EVENT_AXI0_WR_TRAN_REQ_STALLED = 136, + PMU_EVENT_AXI0_WR_DATA_BEAT_STALLED = 137, + PMU_EVENT_AXI0_ENABLED_CYCLES = 140, + PMU_EVENT_AXI0_RD_STALL_LIMIT = 142, + PMU_EVENT_AXI0_WR_STALL_LIMIT = 143, + PMU_EVENT_AXI_LATENCY_ANY = 160, + PMU_EVENT_AXI_LATENCY_32 = 161, + PMU_EVENT_AXI_LATENCY_64 = 162, + PMU_EVENT_AXI_LATENCY_128 = 163, + PMU_EVENT_AXI_LATENCY_256 = 164, + PMU_EVENT_AXI_LATENCY_512 = 165, + PMU_EVENT_AXI_LATENCY_1024 = 166, + PMU_EVENT_ECC_DMA = 176, + PMU_EVENT_ECC_SB0 = 177, + PMU_EVENT_AXI1_RD_TRANS_ACCEPTED = 384, + PMU_EVENT_AXI1_RD_TRANS_COMPLETED = 385, + PMU_EVENT_AXI1_RD_DATA_BEAT_RECEIVED = 386, + PMU_EVENT_AXI1_RD_TRAN_REQ_STALLED = 387, + PMU_EVENT_AXI1_WR_TRANS_ACCEPTED = 388, + PMU_EVENT_AXI1_WR_TRANS_COMPLETED_M = 389, + PMU_EVENT_AXI1_WR_TRANS_COMPLETED_S = 390, + PMU_EVENT_AXI1_WR_DATA_BEAT_WRITTEN = 391, + PMU_EVENT_AXI1_WR_TRAN_REQ_STALLED = 392, + PMU_EVENT_AXI1_WR_DATA_BEAT_STALLED = 393, + PMU_EVENT_AXI1_ENABLED_CYCLES = 396, + PMU_EVENT_AXI1_RD_STALL_LIMIT = 398, + PMU_EVENT_AXI1_WR_STALL_LIMIT = 399, + PMU_EVENT_ECC_SB1 = 433, +}; + +enum pooling_mode +{ + POOLING_MODE_MAX = 0, + POOLING_MODE_AVERAGE = 1, + POOLING_MODE_REDUCE_SUM = 2, +}; + +enum privilege_level +{ + PRIVILEGE_LEVEL_USER = 0, + PRIVILEGE_LEVEL_PRIVILEGED = 1, +}; + +enum round_mode +{ + ROUND_MODE_DBL = 0, + ROUND_MODE_TRUNCATE = 1, + ROUND_MODE_NATURAL = 2, +}; + +enum security_level +{ + SECURITY_LEVEL_SECURE = 
0, + SECURITY_LEVEL_NON_SECURE = 1, +}; + +enum state +{ + STATE_STOPPED = 0, + STATE_RUNNING = 1, +}; + +enum wd_core_slice_state +{ + WD_CORE_SLICE_STATE_HEADER = 0, + WD_CORE_SLICE_STATE_PALETTE = 1, + WD_CORE_SLICE_STATE_WEIGHTS = 2, +}; + +enum wd_ctrl_state +{ + WD_CTRL_STATE_IDLE = 0, + WD_CTRL_STATE_DRAIN = 1, + WD_CTRL_STATE_OFD_INIT = 2, + WD_CTRL_STATE_OFD_RUN = 3, +}; + +enum weight_order +{ + WEIGHT_ORDER_DEPTH_FIRST = 0, + WEIGHT_ORDER_PART_KERNEL_FIRST = 1, +}; + +#endif + +#ifdef NPU_DISASSEMBLE + +static const char* acc_format_str[] = +{ + "ACC_FORMAT_I32", + "ACC_FORMAT_I40", + "ACC_FORMAT_F16", +}; + +static const char* activation_clip_range_str[] = +{ + "ACTIVATION_CLIP_RANGE_OFM_PRECISION", + "****", + "ACTIVATION_CLIP_RANGE_FORCE_UINT8", + "ACTIVATION_CLIP_RANGE_FORCE_INT8", + "****", + "ACTIVATION_CLIP_RANGE_FORCE_INT16", +}; + +static const char* activation_format_str[] = +{ + "ACTIVATION_FORMAT_NHWC", + "ACTIVATION_FORMAT_NHCWB16", +}; + +static const char* activation_function_str[] = +{ + "ACTIVATION_FUNCTION_RELU", + "****", + "****", + "ACTIVATION_FUNCTION_TANH", + "ACTIVATION_FUNCTION_SIGMOID", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "ACTIVATION_FUNCTION_TABLE_0", + "ACTIVATION_FUNCTION_TABLE_1", + "ACTIVATION_FUNCTION_TABLE_2", + "ACTIVATION_FUNCTION_TABLE_3", + "ACTIVATION_FUNCTION_TABLE_4", + "ACTIVATION_FUNCTION_TABLE_5", + "ACTIVATION_FUNCTION_TABLE_6", + "ACTIVATION_FUNCTION_TABLE_7", +}; + +static const char* activation_precision_str[] = +{ + "ACTIVATION_PRECISION_B8", + "ACTIVATION_PRECISION_B16", + "ACTIVATION_PRECISION_B32", + "ACTIVATION_PRECISION_B64", +}; + +static const char* activation_type_str[] = +{ + "ACTIVATION_TYPE_UNSIGNED", + "ACTIVATION_TYPE_SIGNED", +}; + +static const char* axi_mem_encoding_str[] = +{ + "AXI_MEM_ENCODING_DEVICE_NON_BUFFERABLE", + "AXI_MEM_ENCODING_DEVICE_BUFFERABLE", + "AXI_MEM_ENCODING_NORMAL_NON_CACHEABLE_NON_BUFFERABLE", 
+ "AXI_MEM_ENCODING_NORMAL_NON_CACHEABLE_BUFFERABLE", + "AXI_MEM_ENCODING_WRITE_THROUGH_NO_ALLOCATE", + "AXI_MEM_ENCODING_WRITE_THROUGH_READ_ALLOCATE", + "AXI_MEM_ENCODING_WRITE_THROUGH_WRITE_ALLOCATE", + "AXI_MEM_ENCODING_WRITE_THROUGH_READ_AND_WRITE_ALLOCATE", + "AXI_MEM_ENCODING_WRITE_BACK_NO_ALLOCATE", + "AXI_MEM_ENCODING_WRITE_BACK_READ_ALLOCATE", + "AXI_MEM_ENCODING_WRITE_BACK_WRITE_ALLOCATE", + "AXI_MEM_ENCODING_WRITE_BACK_READ_AND_WRITE_ALLOCATE", +}; + +static const char* broadcast_mode_str[] = +{ + "BROADCAST_MODE_DISABLE", + "BROADCAST_MODE_ENABLE", +}; + +static const char* cmd0_opcode_str[] = +{ + "CMD0_OPCODE_NPU_OP_STOP", + "CMD0_OPCODE_NPU_OP_IRQ", + "CMD0_OPCODE_NPU_OP_CONV", + "CMD0_OPCODE_NPU_OP_DEPTHWISE", + "****", + "CMD0_OPCODE_NPU_OP_POOL", + "CMD0_OPCODE_NPU_OP_ELEMENTWISE", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "CMD0_OPCODE_NPU_OP_DMA_START", + "CMD0_OPCODE_NPU_OP_DMA_WAIT", + "CMD0_OPCODE_NPU_OP_KERNEL_WAIT", + "CMD0_OPCODE_NPU_OP_PMU_MASK", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + 
"****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "CMD0_OPCODE_NPU_SET_IFM_PAD_TOP", + "CMD0_OPCODE_NPU_SET_IFM_PAD_LEFT", + "CMD0_OPCODE_NPU_SET_IFM_PAD_RIGHT", + "CMD0_OPCODE_NPU_SET_IFM_PAD_BOTTOM", + "CMD0_OPCODE_NPU_SET_IFM_DEPTH_M1", + "CMD0_OPCODE_NPU_SET_IFM_PRECISION", + "****", + "CMD0_OPCODE_NPU_SET_IFM_UPSCALE", + "****", + "CMD0_OPCODE_NPU_SET_IFM_ZERO_POINT", + "CMD0_OPCODE_NPU_SET_IFM_WIDTH0_M1", + "CMD0_OPCODE_NPU_SET_IFM_HEIGHT0_M1", + "CMD0_OPCODE_NPU_SET_IFM_HEIGHT1_M1", + "CMD0_OPCODE_NPU_SET_IFM_IB_END", + "****", + "CMD0_OPCODE_NPU_SET_IFM_REGION", + "****", + "CMD0_OPCODE_NPU_SET_OFM_WIDTH_M1", + 
"CMD0_OPCODE_NPU_SET_OFM_HEIGHT_M1", + "CMD0_OPCODE_NPU_SET_OFM_DEPTH_M1", + "CMD0_OPCODE_NPU_SET_OFM_PRECISION", + "CMD0_OPCODE_NPU_SET_OFM_BLK_WIDTH_M1", + "CMD0_OPCODE_NPU_SET_OFM_BLK_HEIGHT_M1", + "CMD0_OPCODE_NPU_SET_OFM_BLK_DEPTH_M1", + "CMD0_OPCODE_NPU_SET_OFM_ZERO_POINT", + "****", + "CMD0_OPCODE_NPU_SET_OFM_WIDTH0_M1", + "CMD0_OPCODE_NPU_SET_OFM_HEIGHT0_M1", + "CMD0_OPCODE_NPU_SET_OFM_HEIGHT1_M1", + "****", + "****", + "CMD0_OPCODE_NPU_SET_OFM_REGION", + "CMD0_OPCODE_NPU_SET_KERNEL_WIDTH_M1", + "CMD0_OPCODE_NPU_SET_KERNEL_HEIGHT_M1", + "CMD0_OPCODE_NPU_SET_KERNEL_STRIDE", + "CMD0_OPCODE_NPU_SET_PARALLEL_MODE", + "CMD0_OPCODE_NPU_SET_ACC_FORMAT", + "CMD0_OPCODE_NPU_SET_ACTIVATION", + "CMD0_OPCODE_NPU_SET_ACTIVATION_MIN", + "CMD0_OPCODE_NPU_SET_ACTIVATION_MAX", + "CMD0_OPCODE_NPU_SET_WEIGHT_REGION", + "CMD0_OPCODE_NPU_SET_SCALE_REGION", + "****", + "****", + "****", + "CMD0_OPCODE_NPU_SET_AB_START", + "****", + "CMD0_OPCODE_NPU_SET_BLOCKDEP", + "CMD0_OPCODE_NPU_SET_DMA0_SRC_REGION", + "CMD0_OPCODE_NPU_SET_DMA0_DST_REGION", + "CMD0_OPCODE_NPU_SET_DMA0_SIZE0", + "CMD0_OPCODE_NPU_SET_DMA0_SIZE1", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "CMD0_OPCODE_NPU_SET_IFM2_BROADCAST", + "CMD0_OPCODE_NPU_SET_IFM2_SCALAR", + "****", + "****", + "****", + 
"CMD0_OPCODE_NPU_SET_IFM2_PRECISION", + "****", + "****", + "****", + "CMD0_OPCODE_NPU_SET_IFM2_ZERO_POINT", + "CMD0_OPCODE_NPU_SET_IFM2_WIDTH0_M1", + "CMD0_OPCODE_NPU_SET_IFM2_HEIGHT0_M1", + "CMD0_OPCODE_NPU_SET_IFM2_HEIGHT1_M1", + "CMD0_OPCODE_NPU_SET_IFM2_IB_START", + "****", + "CMD0_OPCODE_NPU_SET_IFM2_REGION", +}; + +static const char* cmd1_opcode_str[] = +{ + "CMD1_OPCODE_NPU_SET_IFM_BASE0", + "CMD1_OPCODE_NPU_SET_IFM_BASE1", + "CMD1_OPCODE_NPU_SET_IFM_BASE2", + "CMD1_OPCODE_NPU_SET_IFM_BASE3", + "CMD1_OPCODE_NPU_SET_IFM_STRIDE_X", + "CMD1_OPCODE_NPU_SET_IFM_STRIDE_Y", + "CMD1_OPCODE_NPU_SET_IFM_STRIDE_C", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "CMD1_OPCODE_NPU_SET_OFM_BASE0", + "CMD1_OPCODE_NPU_SET_OFM_BASE1", + "CMD1_OPCODE_NPU_SET_OFM_BASE2", + "CMD1_OPCODE_NPU_SET_OFM_BASE3", + "CMD1_OPCODE_NPU_SET_OFM_STRIDE_X", + "CMD1_OPCODE_NPU_SET_OFM_STRIDE_Y", + "CMD1_OPCODE_NPU_SET_OFM_STRIDE_C", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "CMD1_OPCODE_NPU_SET_WEIGHT_BASE", + "CMD1_OPCODE_NPU_SET_WEIGHT_LENGTH", + "CMD1_OPCODE_NPU_SET_SCALE_BASE", + "CMD1_OPCODE_NPU_SET_SCALE_LENGTH", + "CMD1_OPCODE_NPU_SET_OFM_SCALE", + "CMD1_OPCODE_NPU_SET_OPA_SCALE", + "CMD1_OPCODE_NPU_SET_OPB_SCALE", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "CMD1_OPCODE_NPU_SET_DMA0_SRC", + "CMD1_OPCODE_NPU_SET_DMA0_DST", + "CMD1_OPCODE_NPU_SET_DMA0_LEN", + "CMD1_OPCODE_NPU_SET_DMA0_SKIP0", + "CMD1_OPCODE_NPU_SET_DMA0_SKIP1", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + 
"****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "CMD1_OPCODE_NPU_SET_IFM2_BASE0", + "CMD1_OPCODE_NPU_SET_IFM2_BASE1", + "CMD1_OPCODE_NPU_SET_IFM2_BASE2", + "CMD1_OPCODE_NPU_SET_IFM2_BASE3", + "CMD1_OPCODE_NPU_SET_IFM2_STRIDE_X", + "CMD1_OPCODE_NPU_SET_IFM2_STRIDE_Y", + "CMD1_OPCODE_NPU_SET_IFM2_STRIDE_C", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "CMD1_OPCODE_NPU_SET_WEIGHT1_BASE", + "CMD1_OPCODE_NPU_SET_WEIGHT1_LENGTH", + "CMD1_OPCODE_NPU_SET_SCALE1_BASE", + "CMD1_OPCODE_NPU_SET_SCALE1_LENGTH", +}; + +static const char* cmd_ctrl_str[] = +{ + "CMD_CTRL_CMD0_CTRL", + "CMD_CTRL_CMD1_CTRL", +}; + +static const char* custom_dma_str[] = +{ + "CUSTOM_DMA_NOT_IMPLEMENTED", + "CUSTOM_DMA_IMPLEMENTED", +}; + +static const char* dma_fault_channel_str[] = +{ + "DMA_FAULT_CHANNEL_CMD_READ", + "DMA_FAULT_CHANNEL_IFM_READ", + "DMA_FAULT_CHANNEL_WEIGHT_READ", + "DMA_FAULT_CHANNEL_SBS_READ", + "DMA_FAULT_CHANNEL_MEM2MEM_READ", + "****", + "****", + "****", + "DMA_FAULT_CHANNEL_OFM_WRITE", + "DMA_FAULT_CHANNEL_MEM2MEM_WRITE", +}; + +static const char* dma_fault_src_str[] = +{ + "DMA_FAULT_SRC_AXI_M0", + "DMA_FAULT_SRC_AXI_M1", +}; + +static const char* dma_region_mode_str[] = +{ + "DMA_REGION_MODE_EXTERNAL", + "DMA_REGION_MODE_INTERNAL", +}; + +static const char* dma_stride_mode_str[] = +{ + "DMA_STRIDE_MODE_D1", + "DMA_STRIDE_MODE_D2", + "DMA_STRIDE_MODE_D3", +}; + +static const char* elementwise_mode_str[] = +{ + "ELEMENTWISE_MODE_MUL", + "ELEMENTWISE_MODE_ADD", + "ELEMENTWISE_MODE_SUB", + "ELEMENTWISE_MODE_MIN", + "ELEMENTWISE_MODE_MAX", + "ELEMENTWISE_MODE_LRELU", + "ELEMENTWISE_MODE_ABS", + "ELEMENTWISE_MODE_CLZ", + "ELEMENTWISE_MODE_SHR", + 
"ELEMENTWISE_MODE_SHL", +}; + +static const char* ifm2_operand_order_str[] = +{ + "IFM2_OPERAND_ORDER_ORDER_B", + "IFM2_OPERAND_ORDER_ORDER_A", +}; + +static const char* ifm_scale_mode_str[] = +{ + "IFM_SCALE_MODE_OPA_OPB_16", + "IFM_SCALE_MODE_OPA_32", + "IFM_SCALE_MODE_OPB_32", +}; + +static const char* ifm_upscale_mode_str[] = +{ + "IFM_UPSCALE_MODE_NONE", + "IFM_UPSCALE_MODE_NEAREST", + "IFM_UPSCALE_MODE_ZEROS", +}; + +static const char* kernel_decomposition_str[] = +{ + "KERNEL_DECOMPOSITION_D8X8", + "KERNEL_DECOMPOSITION_D4X4", +}; + +static const char* kernel_dilation_str[] = +{ + "KERNEL_DILATION_NONE", + "KERNEL_DILATION_X2", +}; + +static const char* max_beats_str[] = +{ + "MAX_BEATS_B64", + "MAX_BEATS_B128", + "MAX_BEATS_B256", +}; + +static const char* mem_attr_str[] = +{ + "MEM_ATTR_AXI0_OUTSTANDING_COUNTER0", + "MEM_ATTR_AXI0_OUTSTANDING_COUNTER1", + "MEM_ATTR_AXI1_OUTSTANDING_COUNTER2", + "MEM_ATTR_AXI1_OUTSTANDING_COUNTER3", +}; + +static const char* ofm_scale_mode_str[] = +{ + "OFM_SCALE_MODE_PER_CHANNEL", + "OFM_SCALE_MODE_GLOBAL", +}; + +static const char* parallel_mode_str[] = +{ + "PARALLEL_MODE_SINGLE_CORE", + "PARALLEL_MODE_DUAL_CORE_DEPTH", +}; + +static const char* pmu_axi_channel_str[] = +{ + "PMU_AXI_CHANNEL_RD_CMD", + "PMU_AXI_CHANNEL_RD_IFM", + "PMU_AXI_CHANNEL_RD_WEIGHTS", + "PMU_AXI_CHANNEL_RD_SCALE_BIAS", + "PMU_AXI_CHANNEL_RD_MEM2MEM", + "****", + "****", + "****", + "PMU_AXI_CHANNEL_WR_OFM", + "PMU_AXI_CHANNEL_WR_MEM2MEM", +}; + +static const char* pmu_event_str[] = +{ + "PMU_EVENT_NO_EVENT", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "PMU_EVENT_CYCLE", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "PMU_EVENT_NPU_IDLE", + "PMU_EVENT_CC_STALLED_ON_BLOCKDEP", + "PMU_EVENT_CC_STALLED_ON_SHRAM_RECONFIG", + 
"PMU_EVENT_NPU_ACTIVE", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "PMU_EVENT_MAC_ACTIVE", + "PMU_EVENT_MAC_ACTIVE_8BIT", + "PMU_EVENT_MAC_ACTIVE_16BIT", + "PMU_EVENT_MAC_DPU_ACTIVE", + "PMU_EVENT_MAC_STALLED_BY_WD_ACC", + "PMU_EVENT_MAC_STALLED_BY_WD", + "PMU_EVENT_MAC_STALLED_BY_ACC", + "PMU_EVENT_MAC_STALLED_BY_IB", + "PMU_EVENT_MAC_ACTIVE_32BIT", + "PMU_EVENT_MAC_STALLED_BY_INT_W", + "PMU_EVENT_MAC_STALLED_BY_INT_ACC", + "****", + "****", + "****", + "****", + "****", + "PMU_EVENT_AO_ACTIVE", + "PMU_EVENT_AO_ACTIVE_8BIT", + "PMU_EVENT_AO_ACTIVE_16BIT", + "PMU_EVENT_AO_STALLED_BY_OFMP_OB", + "PMU_EVENT_AO_STALLED_BY_OFMP", + "PMU_EVENT_AO_STALLED_BY_OB", + "PMU_EVENT_AO_STALLED_BY_ACC_IB", + "PMU_EVENT_AO_STALLED_BY_ACC", + "PMU_EVENT_AO_STALLED_BY_IB", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "PMU_EVENT_WD_ACTIVE", + "PMU_EVENT_WD_STALLED", + "PMU_EVENT_WD_STALLED_BY_WS", + "PMU_EVENT_WD_STALLED_BY_WD_BUF", + "PMU_EVENT_WD_PARSE_ACTIVE", + "PMU_EVENT_WD_PARSE_STALLED", + "PMU_EVENT_WD_PARSE_STALLED_IN", + "PMU_EVENT_WD_PARSE_STALLED_OUT", + "PMU_EVENT_WD_TRANS_WS", + "PMU_EVENT_WD_TRANS_WB", + "PMU_EVENT_WD_TRANS_DW0", + "PMU_EVENT_WD_TRANS_DW1", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "PMU_EVENT_AXI0_RD_TRANS_ACCEPTED", + "PMU_EVENT_AXI0_RD_TRANS_COMPLETED", + "PMU_EVENT_AXI0_RD_DATA_BEAT_RECEIVED", + "PMU_EVENT_AXI0_RD_TRAN_REQ_STALLED", + "PMU_EVENT_AXI0_WR_TRANS_ACCEPTED", + "PMU_EVENT_AXI0_WR_TRANS_COMPLETED_M", + "PMU_EVENT_AXI0_WR_TRANS_COMPLETED_S", + "PMU_EVENT_AXI0_WR_DATA_BEAT_WRITTEN", + "PMU_EVENT_AXI0_WR_TRAN_REQ_STALLED", 
+ "PMU_EVENT_AXI0_WR_DATA_BEAT_STALLED", + "****", + "****", + "PMU_EVENT_AXI0_ENABLED_CYCLES", + "****", + "PMU_EVENT_AXI0_RD_STALL_LIMIT", + "PMU_EVENT_AXI0_WR_STALL_LIMIT", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "PMU_EVENT_AXI_LATENCY_ANY", + "PMU_EVENT_AXI_LATENCY_32", + "PMU_EVENT_AXI_LATENCY_64", + "PMU_EVENT_AXI_LATENCY_128", + "PMU_EVENT_AXI_LATENCY_256", + "PMU_EVENT_AXI_LATENCY_512", + "PMU_EVENT_AXI_LATENCY_1024", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "PMU_EVENT_ECC_DMA", + "PMU_EVENT_ECC_SB0", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + 
"****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "PMU_EVENT_AXI1_RD_TRANS_ACCEPTED", + "PMU_EVENT_AXI1_RD_TRANS_COMPLETED", + "PMU_EVENT_AXI1_RD_DATA_BEAT_RECEIVED", + "PMU_EVENT_AXI1_RD_TRAN_REQ_STALLED", + "PMU_EVENT_AXI1_WR_TRANS_ACCEPTED", + "PMU_EVENT_AXI1_WR_TRANS_COMPLETED_M", + "PMU_EVENT_AXI1_WR_TRANS_COMPLETED_S", + "PMU_EVENT_AXI1_WR_DATA_BEAT_WRITTEN", + "PMU_EVENT_AXI1_WR_TRAN_REQ_STALLED", + "PMU_EVENT_AXI1_WR_DATA_BEAT_STALLED", + "****", + "****", + "PMU_EVENT_AXI1_ENABLED_CYCLES", + "****", + "PMU_EVENT_AXI1_RD_STALL_LIMIT", + "PMU_EVENT_AXI1_WR_STALL_LIMIT", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "PMU_EVENT_ECC_SB1", +}; + +static const char* pooling_mode_str[] = +{ + "POOLING_MODE_MAX", + "POOLING_MODE_AVERAGE", + "POOLING_MODE_REDUCE_SUM", +}; + +static const char* privilege_level_str[] = +{ + "PRIVILEGE_LEVEL_USER", + "PRIVILEGE_LEVEL_PRIVILEGED", +}; + +static const char* round_mode_str[] = +{ + "ROUND_MODE_DBL", + "ROUND_MODE_TRUNCATE", + "ROUND_MODE_NATURAL", +}; + 
+static const char* security_level_str[] = +{ + "SECURITY_LEVEL_SECURE", + "SECURITY_LEVEL_NON_SECURE", +}; + +static const char* state_str[] = +{ + "STATE_STOPPED", + "STATE_RUNNING", +}; + +static const char* wd_core_slice_state_str[] = +{ + "WD_CORE_SLICE_STATE_HEADER", + "WD_CORE_SLICE_STATE_PALETTE", + "WD_CORE_SLICE_STATE_WEIGHTS", +}; + +static const char* wd_ctrl_state_str[] = +{ + "WD_CTRL_STATE_IDLE", + "WD_CTRL_STATE_DRAIN", + "WD_CTRL_STATE_OFD_INIT", + "WD_CTRL_STATE_OFD_RUN", +}; + +static const char* weight_order_str[] = +{ + "WEIGHT_ORDER_DEPTH_FIRST", + "WEIGHT_ORDER_PART_KERNEL_FIRST", +}; + +#endif + + + +struct id_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t version_status : 4; + uint32_t version_minor : 4; + uint32_t version_major : 4; + uint32_t product_major : 4; + uint32_t arch_patch_rev : 4; + uint32_t arch_minor_rev : 8; + uint32_t arch_major_rev : 4; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR id_r() : + word0(268853249) + {} + CONSTEXPR id_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + id_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_version_status() const + { + auto v = ((1U << 4) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR id_r& set_version_status(uint32_t value) + { + word0 = (~(((1U << 4) - 1)<<0) & word0) | ((((1U << 4) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_version_minor() const + { + auto v = ((1U << 4) - 1) & (word0 >> 4); + return v; + } + CONSTEXPR id_r& set_version_minor(uint32_t value) + { + word0 = (~(((1U << 4) - 1)<<4) & word0) | ((((1U << 4) - 1) & value) << 4); + return *this; + } + CONSTEXPR uint32_t get_version_major() const + { + auto v = ((1U << 4) - 1) & (word0 >> 8); + return v; + } + CONSTEXPR id_r& set_version_major(uint32_t value) + { + word0 = (~(((1U << 4) - 1)<<8) & word0) | ((((1U << 4) - 1) & 
value) << 8); + return *this; + } + CONSTEXPR uint32_t get_product_major() const + { + auto v = ((1U << 4) - 1) & (word0 >> 12); + return v; + } + CONSTEXPR id_r& set_product_major(uint32_t value) + { + word0 = (~(((1U << 4) - 1)<<12) & word0) | ((((1U << 4) - 1) & value) << 12); + return *this; + } + CONSTEXPR uint32_t get_arch_patch_rev() const + { + auto v = ((1U << 4) - 1) & (word0 >> 16); + return v; + } + CONSTEXPR id_r& set_arch_patch_rev(uint32_t value) + { + word0 = (~(((1U << 4) - 1)<<16) & word0) | ((((1U << 4) - 1) & value) << 16); + return *this; + } + CONSTEXPR uint32_t get_arch_minor_rev() const + { + auto v = ((1U << 8) - 1) & (word0 >> 20); + return v; + } + CONSTEXPR id_r& set_arch_minor_rev(uint32_t value) + { + word0 = (~(((1U << 8) - 1)<<20) & word0) | ((((1U << 8) - 1) & value) << 20); + return *this; + } + CONSTEXPR uint32_t get_arch_major_rev() const + { + auto v = ((1U << 4) - 1) & (word0 >> 28); + return v; + } + CONSTEXPR id_r& set_arch_major_rev(uint32_t value) + { + word0 = (~(((1U << 4) - 1)<<28) & word0) | ((((1U << 4) - 1) & value) << 28); + return *this; + } +#endif +}; + + +struct status_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t state : 1; + uint32_t irq_raised : 1; + uint32_t bus_status : 1; + uint32_t reset_status : 1; + uint32_t cmd_parse_error : 1; + uint32_t cmd_end_reached : 1; + uint32_t pmu_irq_raised : 1; + uint32_t wd_fault : 1; + uint32_t ecc_fault : 1; + uint32_t reserved0 : 2; + uint32_t faulting_interface : 1; + uint32_t faulting_channel : 4; + uint32_t irq_history_mask : 16; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR status_r() : + word0(8) + {} + CONSTEXPR status_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + status_r copy() + { + return *this; + } + CONSTEXPR NPU_NAMESPACE::state get_state() const + { + auto v = ((1U << 1) - 1) & (word0 >> 
0); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR status_r& set_state(NPU_NAMESPACE::state value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & static_cast(value)) << 0); + return *this; + } + CONSTEXPR uint32_t get_irq_raised() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + return v; + } + CONSTEXPR status_r& set_irq_raised(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & value) << 1); + return *this; + } + CONSTEXPR uint32_t get_bus_status() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + CONSTEXPR status_r& set_bus_status(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) & value) << 2); + return *this; + } + CONSTEXPR uint32_t get_reset_status() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR status_r& set_reset_status(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + return *this; + } + CONSTEXPR uint32_t get_cmd_parse_error() const + { + auto v = ((1U << 1) - 1) & (word0 >> 4); + return v; + } + CONSTEXPR status_r& set_cmd_parse_error(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<4) & word0) | ((((1U << 1) - 1) & value) << 4); + return *this; + } + CONSTEXPR uint32_t get_cmd_end_reached() const + { + auto v = ((1U << 1) - 1) & (word0 >> 5); + return v; + } + CONSTEXPR status_r& set_cmd_end_reached(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<5) & word0) | ((((1U << 1) - 1) & value) << 5); + return *this; + } + CONSTEXPR uint32_t get_pmu_irq_raised() const + { + auto v = ((1U << 1) - 1) & (word0 >> 6); + return v; + } + CONSTEXPR status_r& set_pmu_irq_raised(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<6) & word0) | ((((1U << 1) - 1) & value) << 6); + return *this; + } + CONSTEXPR uint32_t get_wd_fault() const + { + auto v = ((1U << 1) - 1) & (word0 >> 7); + return v; + } + CONSTEXPR status_r& set_wd_fault(uint32_t value) + { + word0 = 
(~(((1U << 1) - 1)<<7) & word0) | ((((1U << 1) - 1) & value) << 7); + return *this; + } + CONSTEXPR uint32_t get_ecc_fault() const + { + auto v = ((1U << 1) - 1) & (word0 >> 8); + return v; + } + CONSTEXPR status_r& set_ecc_fault(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<8) & word0) | ((((1U << 1) - 1) & value) << 8); + return *this; + } + CONSTEXPR NPU_NAMESPACE::dma_fault_src get_faulting_interface() const + { + auto v = ((1U << 1) - 1) & (word0 >> 11); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR status_r& set_faulting_interface(NPU_NAMESPACE::dma_fault_src value) + { + word0 = (~(((1U << 1) - 1)<<11) & word0) | ((((1U << 1) - 1) & static_cast(value)) << 11); + return *this; + } + CONSTEXPR NPU_NAMESPACE::dma_fault_channel get_faulting_channel() const + { + auto v = ((1U << 4) - 1) & (word0 >> 12); + assert(v <= 9); + return static_cast(v); + } + CONSTEXPR status_r& set_faulting_channel(NPU_NAMESPACE::dma_fault_channel value) + { + word0 = (~(((1U << 4) - 1)<<12) & word0) | ((((1U << 4) - 1) & static_cast(value)) << 12); + return *this; + } + CONSTEXPR uint32_t get_irq_history_mask() const + { + auto v = ((1U << 16) - 1) & (word0 >> 16); + return v; + } + CONSTEXPR status_r& set_irq_history_mask(uint32_t value) + { + word0 = (~(((1U << 16) - 1)<<16) & word0) | ((((1U << 16) - 1) & value) << 16); + return *this; + } +#endif +}; + + +struct cmd_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t transition_to_running_state : 1; + uint32_t clear_irq : 1; + uint32_t clock_q_enable : 1; + uint32_t power_q_enable : 1; + uint32_t stop_request : 1; + uint32_t reserved0 : 11; + uint32_t clear_irq_history : 16; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR cmd_r() : + word0(12) + {} + CONSTEXPR cmd_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + cmd_r copy() + { + return *this; + } + 
CONSTEXPR uint32_t get_transition_to_running_state() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR cmd_r& set_transition_to_running_state(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_clear_irq() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + return v; + } + CONSTEXPR cmd_r& set_clear_irq(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & value) << 1); + return *this; + } + CONSTEXPR uint32_t get_clock_q_enable() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + CONSTEXPR cmd_r& set_clock_q_enable(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) & value) << 2); + return *this; + } + CONSTEXPR uint32_t get_power_q_enable() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR cmd_r& set_power_q_enable(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + return *this; + } + CONSTEXPR uint32_t get_stop_request() const + { + auto v = ((1U << 1) - 1) & (word0 >> 4); + return v; + } + CONSTEXPR cmd_r& set_stop_request(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<4) & word0) | ((((1U << 1) - 1) & value) << 4); + return *this; + } + CONSTEXPR uint32_t get_clear_irq_history() const + { + auto v = ((1U << 16) - 1) & (word0 >> 16); + return v; + } + CONSTEXPR cmd_r& set_clear_irq_history(uint32_t value) + { + word0 = (~(((1U << 16) - 1)<<16) & word0) | ((((1U << 16) - 1) & value) << 16); + return *this; + } +#endif +}; + + +struct reset_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t pending_CPL : 1; + uint32_t pending_CSL : 1; + uint32_t reserved0 : 30; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR reset_r() : + word0(0) + {} + CONSTEXPR reset_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + 
{ + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + reset_r copy() + { + return *this; + } + CONSTEXPR NPU_NAMESPACE::privilege_level get_pending_CPL() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR reset_r& set_pending_CPL(NPU_NAMESPACE::privilege_level value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & static_cast(value)) << 0); + return *this; + } + CONSTEXPR NPU_NAMESPACE::security_level get_pending_CSL() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR reset_r& set_pending_CSL(NPU_NAMESPACE::security_level value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & static_cast(value)) << 1); + return *this; + } +#endif +}; + + +struct qbase_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t offset_LO : 32; + uint32_t offset_HI : 8; + uint32_t reserved0 : 24; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR qbase_r() : + word0(0), + word1(0) + {} + CONSTEXPR qbase_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + qbase_r copy() + { + return *this; + } +#endif +}; + + +struct qread_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t QREAD : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR qread_r() : + word0(0) + {} + CONSTEXPR qread_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR 
operator uint32_t() + { + return word0; + } + qread_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_QREAD() const + { + auto v = word0; + return v; + } + CONSTEXPR qread_r& set_QREAD(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct qconfig_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t cmd_region0 : 2; + uint32_t reserved0 : 30; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR qconfig_r() : + word0(0) + {} + CONSTEXPR qconfig_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + qconfig_r copy() + { + return *this; + } + CONSTEXPR NPU_NAMESPACE::mem_attr get_cmd_region0() const + { + auto v = ((1U << 2) - 1) & (word0 >> 0); + assert(v <= 3); + return static_cast(v); + } + CONSTEXPR qconfig_r& set_cmd_region0(NPU_NAMESPACE::mem_attr value) + { + word0 = (~(((1U << 2) - 1)<<0) & word0) | ((((1U << 2) - 1) & static_cast(value)) << 0); + return *this; + } +#endif +}; + + +struct qsize_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t QSIZE : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR qsize_r() : + word0(0) + {} + CONSTEXPR qsize_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + qsize_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_QSIZE() const + { + auto v = word0; + return v; + } + CONSTEXPR qsize_r& set_QSIZE(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct prot_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t active_CPL : 1; + uint32_t active_CSL : 1; + uint32_t reserved0 : 30; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR prot_r() : + word0(0) + {} + CONSTEXPR prot_r(uint32_t init) : + word0(init) + {} + 
CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + prot_r copy() + { + return *this; + } + CONSTEXPR NPU_NAMESPACE::privilege_level get_active_CPL() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR prot_r& set_active_CPL(NPU_NAMESPACE::privilege_level value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & static_cast(value)) << 0); + return *this; + } + CONSTEXPR NPU_NAMESPACE::security_level get_active_CSL() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR prot_r& set_active_CSL(NPU_NAMESPACE::security_level value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & static_cast(value)) << 1); + return *this; + } +#endif +}; + + +struct config_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t macs_per_cc : 4; + uint32_t cmd_stream_version : 4; + uint32_t shram_size : 8; + uint32_t reserved0 : 11; + uint32_t custom_dma : 1; + uint32_t product : 4; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR config_r() : + word0(268435456) + {} + CONSTEXPR config_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + config_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_macs_per_cc() const + { + auto v = ((1U << 4) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR config_r& set_macs_per_cc(uint32_t value) + { + word0 = (~(((1U << 4) - 1)<<0) & word0) | ((((1U << 4) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_cmd_stream_version() const + { + auto v = ((1U << 4) - 1) & (word0 >> 4); + return v; + } + CONSTEXPR config_r& set_cmd_stream_version(uint32_t value) + { + word0 = (~(((1U << 4) - 1)<<4) & word0) | ((((1U << 4) - 1) & value) << 4); + return *this; + } + 
CONSTEXPR uint32_t get_shram_size() const + { + auto v = ((1U << 8) - 1) & (word0 >> 8); + return v; + } + CONSTEXPR config_r& set_shram_size(uint32_t value) + { + word0 = (~(((1U << 8) - 1)<<8) & word0) | ((((1U << 8) - 1) & value) << 8); + return *this; + } + CONSTEXPR NPU_NAMESPACE::custom_dma get_custom_dma() const + { + auto v = ((1U << 1) - 1) & (word0 >> 27); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR config_r& set_custom_dma(NPU_NAMESPACE::custom_dma value) + { + word0 = (~(((1U << 1) - 1)<<27) & word0) | ((((1U << 1) - 1) & static_cast(value)) << 27); + return *this; + } + CONSTEXPR uint32_t get_product() const + { + auto v = ((1U << 4) - 1) & (word0 >> 28); + return v; + } + CONSTEXPR config_r& set_product(uint32_t value) + { + word0 = (~(((1U << 4) - 1)<<28) & word0) | ((((1U << 4) - 1) & value) << 28); + return *this; + } +#endif +}; + + +struct lock_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t LOCK : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR lock_r() : + word0(0) + {} + CONSTEXPR lock_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + lock_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_LOCK() const + { + auto v = word0; + return v; + } + CONSTEXPR lock_r& set_LOCK(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct regioncfg_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t region0 : 2; + uint32_t region1 : 2; + uint32_t region2 : 2; + uint32_t region3 : 2; + uint32_t region4 : 2; + uint32_t region5 : 2; + uint32_t region6 : 2; + uint32_t region7 : 2; + uint32_t reserved0 : 16; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR regioncfg_r() : + word0(0) + {} + CONSTEXPR regioncfg_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + 
} + CONSTEXPR operator uint32_t() + { + return word0; + } + regioncfg_r copy() + { + return *this; + } + CONSTEXPR NPU_NAMESPACE::mem_attr get_region0() const + { + auto v = ((1U << 2) - 1) & (word0 >> 0); + assert(v <= 3); + return static_cast(v); + } + CONSTEXPR regioncfg_r& set_region0(NPU_NAMESPACE::mem_attr value) + { + word0 = (~(((1U << 2) - 1)<<0) & word0) | ((((1U << 2) - 1) & static_cast(value)) << 0); + return *this; + } + CONSTEXPR NPU_NAMESPACE::mem_attr get_region1() const + { + auto v = ((1U << 2) - 1) & (word0 >> 2); + assert(v <= 3); + return static_cast(v); + } + CONSTEXPR regioncfg_r& set_region1(NPU_NAMESPACE::mem_attr value) + { + word0 = (~(((1U << 2) - 1)<<2) & word0) | ((((1U << 2) - 1) & static_cast(value)) << 2); + return *this; + } + CONSTEXPR NPU_NAMESPACE::mem_attr get_region2() const + { + auto v = ((1U << 2) - 1) & (word0 >> 4); + assert(v <= 3); + return static_cast(v); + } + CONSTEXPR regioncfg_r& set_region2(NPU_NAMESPACE::mem_attr value) + { + word0 = (~(((1U << 2) - 1)<<4) & word0) | ((((1U << 2) - 1) & static_cast(value)) << 4); + return *this; + } + CONSTEXPR NPU_NAMESPACE::mem_attr get_region3() const + { + auto v = ((1U << 2) - 1) & (word0 >> 6); + assert(v <= 3); + return static_cast(v); + } + CONSTEXPR regioncfg_r& set_region3(NPU_NAMESPACE::mem_attr value) + { + word0 = (~(((1U << 2) - 1)<<6) & word0) | ((((1U << 2) - 1) & static_cast(value)) << 6); + return *this; + } + CONSTEXPR NPU_NAMESPACE::mem_attr get_region4() const + { + auto v = ((1U << 2) - 1) & (word0 >> 8); + assert(v <= 3); + return static_cast(v); + } + CONSTEXPR regioncfg_r& set_region4(NPU_NAMESPACE::mem_attr value) + { + word0 = (~(((1U << 2) - 1)<<8) & word0) | ((((1U << 2) - 1) & static_cast(value)) << 8); + return *this; + } + CONSTEXPR NPU_NAMESPACE::mem_attr get_region5() const + { + auto v = ((1U << 2) - 1) & (word0 >> 10); + assert(v <= 3); + return static_cast(v); + } + CONSTEXPR regioncfg_r& set_region5(NPU_NAMESPACE::mem_attr value) + { + word0 
= (~(((1U << 2) - 1)<<10) & word0) | ((((1U << 2) - 1) & static_cast(value)) << 10); + return *this; + } + CONSTEXPR NPU_NAMESPACE::mem_attr get_region6() const + { + auto v = ((1U << 2) - 1) & (word0 >> 12); + assert(v <= 3); + return static_cast(v); + } + CONSTEXPR regioncfg_r& set_region6(NPU_NAMESPACE::mem_attr value) + { + word0 = (~(((1U << 2) - 1)<<12) & word0) | ((((1U << 2) - 1) & static_cast(value)) << 12); + return *this; + } + CONSTEXPR NPU_NAMESPACE::mem_attr get_region7() const + { + auto v = ((1U << 2) - 1) & (word0 >> 14); + assert(v <= 3); + return static_cast(v); + } + CONSTEXPR regioncfg_r& set_region7(NPU_NAMESPACE::mem_attr value) + { + word0 = (~(((1U << 2) - 1)<<14) & word0) | ((((1U << 2) - 1) & static_cast(value)) << 14); + return *this; + } +#endif +}; + + +struct axi_limit0_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t max_beats : 2; + uint32_t reserved0 : 2; + uint32_t memtype : 4; + uint32_t reserved1 : 8; + uint32_t max_outstanding_read_m1 : 6; + uint32_t reserved2 : 2; + uint32_t max_outstanding_write_m1 : 5; + uint32_t reserved3 : 3; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR axi_limit0_r() : + word0(0) + {} + CONSTEXPR axi_limit0_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + axi_limit0_r copy() + { + return *this; + } + CONSTEXPR NPU_NAMESPACE::max_beats get_max_beats() const + { + auto v = ((1U << 2) - 1) & (word0 >> 0); + assert(v <= 2); + return static_cast(v); + } + CONSTEXPR axi_limit0_r& set_max_beats(NPU_NAMESPACE::max_beats value) + { + word0 = (~(((1U << 2) - 1)<<0) & word0) | ((((1U << 2) - 1) & static_cast(value)) << 0); + return *this; + } + CONSTEXPR NPU_NAMESPACE::axi_mem_encoding get_memtype() const + { + auto v = ((1U << 4) - 1) & (word0 >> 4); + assert(v <= 11); + return static_cast(v); + } + CONSTEXPR axi_limit0_r& 
set_memtype(NPU_NAMESPACE::axi_mem_encoding value) + { + word0 = (~(((1U << 4) - 1)<<4) & word0) | ((((1U << 4) - 1) & static_cast(value)) << 4); + return *this; + } + CONSTEXPR uint32_t get_max_outstanding_read_m1() const + { + auto v = ((1U << 6) - 1) & (word0 >> 16); + return v; + } + CONSTEXPR axi_limit0_r& set_max_outstanding_read_m1(uint32_t value) + { + word0 = (~(((1U << 6) - 1)<<16) & word0) | ((((1U << 6) - 1) & value) << 16); + return *this; + } + CONSTEXPR uint32_t get_max_outstanding_write_m1() const + { + auto v = ((1U << 5) - 1) & (word0 >> 24); + return v; + } + CONSTEXPR axi_limit0_r& set_max_outstanding_write_m1(uint32_t value) + { + word0 = (~(((1U << 5) - 1)<<24) & word0) | ((((1U << 5) - 1) & value) << 24); + return *this; + } +#endif +}; + + +struct axi_limit1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t max_beats : 2; + uint32_t reserved0 : 2; + uint32_t memtype : 4; + uint32_t reserved1 : 8; + uint32_t max_outstanding_read_m1 : 6; + uint32_t reserved2 : 2; + uint32_t max_outstanding_write_m1 : 5; + uint32_t reserved3 : 3; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR axi_limit1_r() : + word0(0) + {} + CONSTEXPR axi_limit1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + axi_limit1_r copy() + { + return *this; + } + CONSTEXPR NPU_NAMESPACE::max_beats get_max_beats() const + { + auto v = ((1U << 2) - 1) & (word0 >> 0); + assert(v <= 2); + return static_cast(v); + } + CONSTEXPR axi_limit1_r& set_max_beats(NPU_NAMESPACE::max_beats value) + { + word0 = (~(((1U << 2) - 1)<<0) & word0) | ((((1U << 2) - 1) & static_cast(value)) << 0); + return *this; + } + CONSTEXPR NPU_NAMESPACE::axi_mem_encoding get_memtype() const + { + auto v = ((1U << 4) - 1) & (word0 >> 4); + assert(v <= 11); + return static_cast(v); + } + CONSTEXPR axi_limit1_r& set_memtype(NPU_NAMESPACE::axi_mem_encoding 
value) + { + word0 = (~(((1U << 4) - 1)<<4) & word0) | ((((1U << 4) - 1) & static_cast(value)) << 4); + return *this; + } + CONSTEXPR uint32_t get_max_outstanding_read_m1() const + { + auto v = ((1U << 6) - 1) & (word0 >> 16); + return v; + } + CONSTEXPR axi_limit1_r& set_max_outstanding_read_m1(uint32_t value) + { + word0 = (~(((1U << 6) - 1)<<16) & word0) | ((((1U << 6) - 1) & value) << 16); + return *this; + } + CONSTEXPR uint32_t get_max_outstanding_write_m1() const + { + auto v = ((1U << 5) - 1) & (word0 >> 24); + return v; + } + CONSTEXPR axi_limit1_r& set_max_outstanding_write_m1(uint32_t value) + { + word0 = (~(((1U << 5) - 1)<<24) & word0) | ((((1U << 5) - 1) & value) << 24); + return *this; + } +#endif +}; + + +struct axi_limit2_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t max_beats : 2; + uint32_t reserved0 : 2; + uint32_t memtype : 4; + uint32_t reserved1 : 8; + uint32_t max_outstanding_read_m1 : 6; + uint32_t reserved2 : 2; + uint32_t max_outstanding_write_m1 : 5; + uint32_t reserved3 : 3; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR axi_limit2_r() : + word0(0) + {} + CONSTEXPR axi_limit2_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + axi_limit2_r copy() + { + return *this; + } + CONSTEXPR NPU_NAMESPACE::max_beats get_max_beats() const + { + auto v = ((1U << 2) - 1) & (word0 >> 0); + assert(v <= 2); + return static_cast(v); + } + CONSTEXPR axi_limit2_r& set_max_beats(NPU_NAMESPACE::max_beats value) + { + word0 = (~(((1U << 2) - 1)<<0) & word0) | ((((1U << 2) - 1) & static_cast(value)) << 0); + return *this; + } + CONSTEXPR NPU_NAMESPACE::axi_mem_encoding get_memtype() const + { + auto v = ((1U << 4) - 1) & (word0 >> 4); + assert(v <= 11); + return static_cast(v); + } + CONSTEXPR axi_limit2_r& set_memtype(NPU_NAMESPACE::axi_mem_encoding value) + { + word0 = (~(((1U << 4) - 1)<<4) & 
word0) | ((((1U << 4) - 1) & static_cast(value)) << 4); + return *this; + } + CONSTEXPR uint32_t get_max_outstanding_read_m1() const + { + auto v = ((1U << 6) - 1) & (word0 >> 16); + return v; + } + CONSTEXPR axi_limit2_r& set_max_outstanding_read_m1(uint32_t value) + { + word0 = (~(((1U << 6) - 1)<<16) & word0) | ((((1U << 6) - 1) & value) << 16); + return *this; + } + CONSTEXPR uint32_t get_max_outstanding_write_m1() const + { + auto v = ((1U << 5) - 1) & (word0 >> 24); + return v; + } + CONSTEXPR axi_limit2_r& set_max_outstanding_write_m1(uint32_t value) + { + word0 = (~(((1U << 5) - 1)<<24) & word0) | ((((1U << 5) - 1) & value) << 24); + return *this; + } +#endif +}; + + +struct axi_limit3_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t max_beats : 2; + uint32_t reserved0 : 2; + uint32_t memtype : 4; + uint32_t reserved1 : 8; + uint32_t max_outstanding_read_m1 : 6; + uint32_t reserved2 : 2; + uint32_t max_outstanding_write_m1 : 5; + uint32_t reserved3 : 3; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR axi_limit3_r() : + word0(0) + {} + CONSTEXPR axi_limit3_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + axi_limit3_r copy() + { + return *this; + } + CONSTEXPR NPU_NAMESPACE::max_beats get_max_beats() const + { + auto v = ((1U << 2) - 1) & (word0 >> 0); + assert(v <= 2); + return static_cast(v); + } + CONSTEXPR axi_limit3_r& set_max_beats(NPU_NAMESPACE::max_beats value) + { + word0 = (~(((1U << 2) - 1)<<0) & word0) | ((((1U << 2) - 1) & static_cast(value)) << 0); + return *this; + } + CONSTEXPR NPU_NAMESPACE::axi_mem_encoding get_memtype() const + { + auto v = ((1U << 4) - 1) & (word0 >> 4); + assert(v <= 11); + return static_cast(v); + } + CONSTEXPR axi_limit3_r& set_memtype(NPU_NAMESPACE::axi_mem_encoding value) + { + word0 = (~(((1U << 4) - 1)<<4) & word0) | ((((1U << 4) - 1) & static_cast(value)) 
<< 4); + return *this; + } + CONSTEXPR uint32_t get_max_outstanding_read_m1() const + { + auto v = ((1U << 6) - 1) & (word0 >> 16); + return v; + } + CONSTEXPR axi_limit3_r& set_max_outstanding_read_m1(uint32_t value) + { + word0 = (~(((1U << 6) - 1)<<16) & word0) | ((((1U << 6) - 1) & value) << 16); + return *this; + } + CONSTEXPR uint32_t get_max_outstanding_write_m1() const + { + auto v = ((1U << 5) - 1) & (word0 >> 24); + return v; + } + CONSTEXPR axi_limit3_r& set_max_outstanding_write_m1(uint32_t value) + { + word0 = (~(((1U << 5) - 1)<<24) & word0) | ((((1U << 5) - 1) & value) << 24); + return *this; + } +#endif +}; + + +struct basep_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t offset_LO : 32; + uint32_t offset_HI : 8; + uint32_t reserved0 : 24; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR basep_r() : + word0(0), + word1(0) + {} + CONSTEXPR basep_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + basep_r copy() + { + return *this; + } +#endif +}; + + +struct wd_status_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t core_slice_state : 2; + uint32_t core_idle : 1; + uint32_t ctrl_state : 2; + uint32_t ctrl_idle : 1; + uint32_t write_buf_index0 : 3; + uint32_t write_buf_valid0 : 1; + uint32_t write_buf_idle0 : 1; + uint32_t write_buf_index1 : 3; + uint32_t write_buf_valid1 : 1; + uint32_t write_buf_idle1 : 1; + uint32_t events : 12; + uint32_t reserved0 : 4; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR wd_status_r() : + word0(0) + 
{} + CONSTEXPR wd_status_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + wd_status_r copy() + { + return *this; + } + CONSTEXPR NPU_NAMESPACE::wd_core_slice_state get_core_slice_state() const + { + auto v = ((1U << 2) - 1) & (word0 >> 0); + assert(v <= 2); + return static_cast(v); + } + CONSTEXPR wd_status_r& set_core_slice_state(NPU_NAMESPACE::wd_core_slice_state value) + { + word0 = (~(((1U << 2) - 1)<<0) & word0) | ((((1U << 2) - 1) & static_cast(value)) << 0); + return *this; + } + CONSTEXPR uint32_t get_core_idle() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + CONSTEXPR wd_status_r& set_core_idle(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) & value) << 2); + return *this; + } + CONSTEXPR NPU_NAMESPACE::wd_ctrl_state get_ctrl_state() const + { + auto v = ((1U << 2) - 1) & (word0 >> 3); + assert(v <= 3); + return static_cast(v); + } + CONSTEXPR wd_status_r& set_ctrl_state(NPU_NAMESPACE::wd_ctrl_state value) + { + word0 = (~(((1U << 2) - 1)<<3) & word0) | ((((1U << 2) - 1) & static_cast(value)) << 3); + return *this; + } + CONSTEXPR uint32_t get_ctrl_idle() const + { + auto v = ((1U << 1) - 1) & (word0 >> 5); + return v; + } + CONSTEXPR wd_status_r& set_ctrl_idle(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<5) & word0) | ((((1U << 1) - 1) & value) << 5); + return *this; + } + CONSTEXPR uint32_t get_write_buf_index0() const + { + auto v = ((1U << 3) - 1) & (word0 >> 6); + return v; + } + CONSTEXPR wd_status_r& set_write_buf_index0(uint32_t value) + { + word0 = (~(((1U << 3) - 1)<<6) & word0) | ((((1U << 3) - 1) & value) << 6); + return *this; + } + CONSTEXPR uint32_t get_write_buf_valid0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 9); + return v; + } + CONSTEXPR wd_status_r& set_write_buf_valid0(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<9) & word0) | ((((1U << 1) - 1) & 
value) << 9); + return *this; + } + CONSTEXPR uint32_t get_write_buf_idle0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 10); + return v; + } + CONSTEXPR wd_status_r& set_write_buf_idle0(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<10) & word0) | ((((1U << 1) - 1) & value) << 10); + return *this; + } + CONSTEXPR uint32_t get_write_buf_index1() const + { + auto v = ((1U << 3) - 1) & (word0 >> 11); + return v; + } + CONSTEXPR wd_status_r& set_write_buf_index1(uint32_t value) + { + word0 = (~(((1U << 3) - 1)<<11) & word0) | ((((1U << 3) - 1) & value) << 11); + return *this; + } + CONSTEXPR uint32_t get_write_buf_valid1() const + { + auto v = ((1U << 1) - 1) & (word0 >> 14); + return v; + } + CONSTEXPR wd_status_r& set_write_buf_valid1(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<14) & word0) | ((((1U << 1) - 1) & value) << 14); + return *this; + } + CONSTEXPR uint32_t get_write_buf_idle1() const + { + auto v = ((1U << 1) - 1) & (word0 >> 15); + return v; + } + CONSTEXPR wd_status_r& set_write_buf_idle1(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<15) & word0) | ((((1U << 1) - 1) & value) << 15); + return *this; + } + CONSTEXPR uint32_t get_events() const + { + auto v = ((1U << 12) - 1) & (word0 >> 16); + return v; + } + CONSTEXPR wd_status_r& set_events(uint32_t value) + { + word0 = (~(((1U << 12) - 1)<<16) & word0) | ((((1U << 12) - 1) & value) << 16); + return *this; + } +#endif +}; + + +struct mac_status_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t block_cfg_valid : 1; + uint32_t trav_en : 1; + uint32_t wait_for_ib : 1; + uint32_t wait_for_acc_buf : 1; + uint32_t wait_for_weights : 1; + uint32_t stall_stripe : 1; + uint32_t dw_sel : 1; + uint32_t wait_for_dw0_ready : 1; + uint32_t wait_for_dw1_ready : 1; + uint32_t acc_buf_sel_ai : 1; + uint32_t wait_for_acc0_ready : 1; + uint32_t wait_for_acc1_ready : 1; + uint32_t acc_buf_sel_aa : 1; + uint32_t acc0_valid : 1; + uint32_t acc1_valid : 1; + uint32_t reserved0 : 1; + uint32_t 
events : 11; + uint32_t reserved1 : 5; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR mac_status_r() : + word0(0) + {} + CONSTEXPR mac_status_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + mac_status_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_block_cfg_valid() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR mac_status_r& set_block_cfg_valid(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_trav_en() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + return v; + } + CONSTEXPR mac_status_r& set_trav_en(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & value) << 1); + return *this; + } + CONSTEXPR uint32_t get_wait_for_ib() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + CONSTEXPR mac_status_r& set_wait_for_ib(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) & value) << 2); + return *this; + } + CONSTEXPR uint32_t get_wait_for_acc_buf() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR mac_status_r& set_wait_for_acc_buf(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + return *this; + } + CONSTEXPR uint32_t get_wait_for_weights() const + { + auto v = ((1U << 1) - 1) & (word0 >> 4); + return v; + } + CONSTEXPR mac_status_r& set_wait_for_weights(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<4) & word0) | ((((1U << 1) - 1) & value) << 4); + return *this; + } + CONSTEXPR uint32_t get_stall_stripe() const + { + auto v = ((1U << 1) - 1) & (word0 >> 5); + return v; + } + CONSTEXPR mac_status_r& set_stall_stripe(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<5) & word0) | ((((1U << 1) - 1) & value) << 5); + 
return *this; + } + CONSTEXPR uint32_t get_dw_sel() const + { + auto v = ((1U << 1) - 1) & (word0 >> 6); + return v; + } + CONSTEXPR mac_status_r& set_dw_sel(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<6) & word0) | ((((1U << 1) - 1) & value) << 6); + return *this; + } + CONSTEXPR uint32_t get_wait_for_dw0_ready() const + { + auto v = ((1U << 1) - 1) & (word0 >> 7); + return v; + } + CONSTEXPR mac_status_r& set_wait_for_dw0_ready(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<7) & word0) | ((((1U << 1) - 1) & value) << 7); + return *this; + } + CONSTEXPR uint32_t get_wait_for_dw1_ready() const + { + auto v = ((1U << 1) - 1) & (word0 >> 8); + return v; + } + CONSTEXPR mac_status_r& set_wait_for_dw1_ready(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<8) & word0) | ((((1U << 1) - 1) & value) << 8); + return *this; + } + CONSTEXPR uint32_t get_acc_buf_sel_ai() const + { + auto v = ((1U << 1) - 1) & (word0 >> 9); + return v; + } + CONSTEXPR mac_status_r& set_acc_buf_sel_ai(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<9) & word0) | ((((1U << 1) - 1) & value) << 9); + return *this; + } + CONSTEXPR uint32_t get_wait_for_acc0_ready() const + { + auto v = ((1U << 1) - 1) & (word0 >> 10); + return v; + } + CONSTEXPR mac_status_r& set_wait_for_acc0_ready(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<10) & word0) | ((((1U << 1) - 1) & value) << 10); + return *this; + } + CONSTEXPR uint32_t get_wait_for_acc1_ready() const + { + auto v = ((1U << 1) - 1) & (word0 >> 11); + return v; + } + CONSTEXPR mac_status_r& set_wait_for_acc1_ready(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<11) & word0) | ((((1U << 1) - 1) & value) << 11); + return *this; + } + CONSTEXPR uint32_t get_acc_buf_sel_aa() const + { + auto v = ((1U << 1) - 1) & (word0 >> 12); + return v; + } + CONSTEXPR mac_status_r& set_acc_buf_sel_aa(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<12) & word0) | ((((1U << 1) - 1) & value) << 12); + return *this; + } + CONSTEXPR uint32_t 
get_acc0_valid() const + { + auto v = ((1U << 1) - 1) & (word0 >> 13); + return v; + } + CONSTEXPR mac_status_r& set_acc0_valid(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<13) & word0) | ((((1U << 1) - 1) & value) << 13); + return *this; + } + CONSTEXPR uint32_t get_acc1_valid() const + { + auto v = ((1U << 1) - 1) & (word0 >> 14); + return v; + } + CONSTEXPR mac_status_r& set_acc1_valid(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<14) & word0) | ((((1U << 1) - 1) & value) << 14); + return *this; + } + CONSTEXPR uint32_t get_events() const + { + auto v = ((1U << 11) - 1) & (word0 >> 16); + return v; + } + CONSTEXPR mac_status_r& set_events(uint32_t value) + { + word0 = (~(((1U << 11) - 1)<<16) & word0) | ((((1U << 11) - 1) & value) << 16); + return *this; + } +#endif +}; + + +struct ao_status_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t cmd_sbw_valid : 1; + uint32_t cmd_act_valid : 1; + uint32_t cmd_ctl_valid : 1; + uint32_t cmd_scl_valid : 1; + uint32_t cmd_sbr_valid : 1; + uint32_t cmd_ofm_valid : 1; + uint32_t blk_cmd_ready : 1; + uint32_t blk_cmd_valid : 1; + uint32_t reserved0 : 8; + uint32_t events : 8; + uint32_t reserved1 : 8; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ao_status_r() : + word0(0) + {} + CONSTEXPR ao_status_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ao_status_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_cmd_sbw_valid() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR ao_status_r& set_cmd_sbw_valid(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_cmd_act_valid() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + return v; + } + CONSTEXPR ao_status_r& set_cmd_act_valid(uint32_t value) + { + word0 = (~(((1U << 1) - 
1)<<1) & word0) | ((((1U << 1) - 1) & value) << 1); + return *this; + } + CONSTEXPR uint32_t get_cmd_ctl_valid() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + CONSTEXPR ao_status_r& set_cmd_ctl_valid(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) & value) << 2); + return *this; + } + CONSTEXPR uint32_t get_cmd_scl_valid() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR ao_status_r& set_cmd_scl_valid(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + return *this; + } + CONSTEXPR uint32_t get_cmd_sbr_valid() const + { + auto v = ((1U << 1) - 1) & (word0 >> 4); + return v; + } + CONSTEXPR ao_status_r& set_cmd_sbr_valid(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<4) & word0) | ((((1U << 1) - 1) & value) << 4); + return *this; + } + CONSTEXPR uint32_t get_cmd_ofm_valid() const + { + auto v = ((1U << 1) - 1) & (word0 >> 5); + return v; + } + CONSTEXPR ao_status_r& set_cmd_ofm_valid(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<5) & word0) | ((((1U << 1) - 1) & value) << 5); + return *this; + } + CONSTEXPR uint32_t get_blk_cmd_ready() const + { + auto v = ((1U << 1) - 1) & (word0 >> 6); + return v; + } + CONSTEXPR ao_status_r& set_blk_cmd_ready(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<6) & word0) | ((((1U << 1) - 1) & value) << 6); + return *this; + } + CONSTEXPR uint32_t get_blk_cmd_valid() const + { + auto v = ((1U << 1) - 1) & (word0 >> 7); + return v; + } + CONSTEXPR ao_status_r& set_blk_cmd_valid(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<7) & word0) | ((((1U << 1) - 1) & value) << 7); + return *this; + } + CONSTEXPR uint32_t get_events() const + { + auto v = ((1U << 8) - 1) & (word0 >> 16); + return v; + } + CONSTEXPR ao_status_r& set_events(uint32_t value) + { + word0 = (~(((1U << 8) - 1)<<16) & word0) | ((((1U << 8) - 1) & value) << 16); + return *this; + } +#endif +}; + + +struct dma_status0_r +{ 
+#ifndef __cplusplus + union + { + struct + { + uint32_t cmd_idle : 1; + uint32_t ifm_idle : 1; + uint32_t wgt_idle_c0 : 1; + uint32_t bas_idle_c0 : 1; + uint32_t m2m_idle : 1; + uint32_t ofm_idle : 1; + uint32_t halt_req : 1; + uint32_t halt_ack : 1; + uint32_t pause_req : 1; + uint32_t pause_ack : 1; + uint32_t ib0_ai_valid_c0 : 1; + uint32_t ib0_ai_ready_c0 : 1; + uint32_t ib1_ai_valid_c0 : 1; + uint32_t ib1_ai_ready_c0 : 1; + uint32_t ib0_ao_valid_c0 : 1; + uint32_t ib0_ao_ready_c0 : 1; + uint32_t ib1_ao_valid_c0 : 1; + uint32_t ib1_ao_ready_c0 : 1; + uint32_t ob0_valid_c0 : 1; + uint32_t ob0_ready_c0 : 1; + uint32_t ob1_valid_c0 : 1; + uint32_t ob1_ready_c0 : 1; + uint32_t cmd_valid : 1; + uint32_t cmd_ready : 1; + uint32_t wd_bitstream_valid_c0 : 1; + uint32_t wd_bitstream_ready_c0 : 1; + uint32_t bs_bitstream_valid_c0 : 1; + uint32_t bs_bitstream_ready_c0 : 1; + uint32_t axi0_ar_stalled : 1; + uint32_t axi0_rd_limit_stall : 1; + uint32_t axi0_aw_stalled : 1; + uint32_t axi0_w_stalled : 1; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR dma_status0_r() : + word0(0) + {} + CONSTEXPR dma_status0_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + dma_status0_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_cmd_idle() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR dma_status0_r& set_cmd_idle(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_ifm_idle() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + return v; + } + CONSTEXPR dma_status0_r& set_ifm_idle(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & value) << 1); + return *this; + } + CONSTEXPR uint32_t get_wgt_idle_c0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + 
CONSTEXPR dma_status0_r& set_wgt_idle_c0(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) & value) << 2); + return *this; + } + CONSTEXPR uint32_t get_bas_idle_c0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR dma_status0_r& set_bas_idle_c0(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + return *this; + } + CONSTEXPR uint32_t get_m2m_idle() const + { + auto v = ((1U << 1) - 1) & (word0 >> 4); + return v; + } + CONSTEXPR dma_status0_r& set_m2m_idle(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<4) & word0) | ((((1U << 1) - 1) & value) << 4); + return *this; + } + CONSTEXPR uint32_t get_ofm_idle() const + { + auto v = ((1U << 1) - 1) & (word0 >> 5); + return v; + } + CONSTEXPR dma_status0_r& set_ofm_idle(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<5) & word0) | ((((1U << 1) - 1) & value) << 5); + return *this; + } + CONSTEXPR uint32_t get_halt_req() const + { + auto v = ((1U << 1) - 1) & (word0 >> 6); + return v; + } + CONSTEXPR dma_status0_r& set_halt_req(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<6) & word0) | ((((1U << 1) - 1) & value) << 6); + return *this; + } + CONSTEXPR uint32_t get_halt_ack() const + { + auto v = ((1U << 1) - 1) & (word0 >> 7); + return v; + } + CONSTEXPR dma_status0_r& set_halt_ack(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<7) & word0) | ((((1U << 1) - 1) & value) << 7); + return *this; + } + CONSTEXPR uint32_t get_pause_req() const + { + auto v = ((1U << 1) - 1) & (word0 >> 8); + return v; + } + CONSTEXPR dma_status0_r& set_pause_req(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<8) & word0) | ((((1U << 1) - 1) & value) << 8); + return *this; + } + CONSTEXPR uint32_t get_pause_ack() const + { + auto v = ((1U << 1) - 1) & (word0 >> 9); + return v; + } + CONSTEXPR dma_status0_r& set_pause_ack(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<9) & word0) | ((((1U << 1) - 1) & value) << 9); + return 
*this; + } + CONSTEXPR uint32_t get_ib0_ai_valid_c0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 10); + return v; + } + CONSTEXPR dma_status0_r& set_ib0_ai_valid_c0(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<10) & word0) | ((((1U << 1) - 1) & value) << 10); + return *this; + } + CONSTEXPR uint32_t get_ib0_ai_ready_c0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 11); + return v; + } + CONSTEXPR dma_status0_r& set_ib0_ai_ready_c0(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<11) & word0) | ((((1U << 1) - 1) & value) << 11); + return *this; + } + CONSTEXPR uint32_t get_ib1_ai_valid_c0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 12); + return v; + } + CONSTEXPR dma_status0_r& set_ib1_ai_valid_c0(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<12) & word0) | ((((1U << 1) - 1) & value) << 12); + return *this; + } + CONSTEXPR uint32_t get_ib1_ai_ready_c0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 13); + return v; + } + CONSTEXPR dma_status0_r& set_ib1_ai_ready_c0(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<13) & word0) | ((((1U << 1) - 1) & value) << 13); + return *this; + } + CONSTEXPR uint32_t get_ib0_ao_valid_c0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 14); + return v; + } + CONSTEXPR dma_status0_r& set_ib0_ao_valid_c0(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<14) & word0) | ((((1U << 1) - 1) & value) << 14); + return *this; + } + CONSTEXPR uint32_t get_ib0_ao_ready_c0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 15); + return v; + } + CONSTEXPR dma_status0_r& set_ib0_ao_ready_c0(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<15) & word0) | ((((1U << 1) - 1) & value) << 15); + return *this; + } + CONSTEXPR uint32_t get_ib1_ao_valid_c0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 16); + return v; + } + CONSTEXPR dma_status0_r& set_ib1_ao_valid_c0(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<16) & word0) | ((((1U << 1) - 1) & value) << 16); + return *this; + } + CONSTEXPR uint32_t 
get_ib1_ao_ready_c0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 17); + return v; + } + CONSTEXPR dma_status0_r& set_ib1_ao_ready_c0(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<17) & word0) | ((((1U << 1) - 1) & value) << 17); + return *this; + } + CONSTEXPR uint32_t get_ob0_valid_c0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 18); + return v; + } + CONSTEXPR dma_status0_r& set_ob0_valid_c0(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<18) & word0) | ((((1U << 1) - 1) & value) << 18); + return *this; + } + CONSTEXPR uint32_t get_ob0_ready_c0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 19); + return v; + } + CONSTEXPR dma_status0_r& set_ob0_ready_c0(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<19) & word0) | ((((1U << 1) - 1) & value) << 19); + return *this; + } + CONSTEXPR uint32_t get_ob1_valid_c0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 20); + return v; + } + CONSTEXPR dma_status0_r& set_ob1_valid_c0(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<20) & word0) | ((((1U << 1) - 1) & value) << 20); + return *this; + } + CONSTEXPR uint32_t get_ob1_ready_c0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 21); + return v; + } + CONSTEXPR dma_status0_r& set_ob1_ready_c0(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<21) & word0) | ((((1U << 1) - 1) & value) << 21); + return *this; + } + CONSTEXPR uint32_t get_cmd_valid() const + { + auto v = ((1U << 1) - 1) & (word0 >> 22); + return v; + } + CONSTEXPR dma_status0_r& set_cmd_valid(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<22) & word0) | ((((1U << 1) - 1) & value) << 22); + return *this; + } + CONSTEXPR uint32_t get_cmd_ready() const + { + auto v = ((1U << 1) - 1) & (word0 >> 23); + return v; + } + CONSTEXPR dma_status0_r& set_cmd_ready(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<23) & word0) | ((((1U << 1) - 1) & value) << 23); + return *this; + } + CONSTEXPR uint32_t get_wd_bitstream_valid_c0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 24); + 
return v; + } + CONSTEXPR dma_status0_r& set_wd_bitstream_valid_c0(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<24) & word0) | ((((1U << 1) - 1) & value) << 24); + return *this; + } + CONSTEXPR uint32_t get_wd_bitstream_ready_c0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 25); + return v; + } + CONSTEXPR dma_status0_r& set_wd_bitstream_ready_c0(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<25) & word0) | ((((1U << 1) - 1) & value) << 25); + return *this; + } + CONSTEXPR uint32_t get_bs_bitstream_valid_c0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 26); + return v; + } + CONSTEXPR dma_status0_r& set_bs_bitstream_valid_c0(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<26) & word0) | ((((1U << 1) - 1) & value) << 26); + return *this; + } + CONSTEXPR uint32_t get_bs_bitstream_ready_c0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 27); + return v; + } + CONSTEXPR dma_status0_r& set_bs_bitstream_ready_c0(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<27) & word0) | ((((1U << 1) - 1) & value) << 27); + return *this; + } + CONSTEXPR uint32_t get_axi0_ar_stalled() const + { + auto v = ((1U << 1) - 1) & (word0 >> 28); + return v; + } + CONSTEXPR dma_status0_r& set_axi0_ar_stalled(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<28) & word0) | ((((1U << 1) - 1) & value) << 28); + return *this; + } + CONSTEXPR uint32_t get_axi0_rd_limit_stall() const + { + auto v = ((1U << 1) - 1) & (word0 >> 29); + return v; + } + CONSTEXPR dma_status0_r& set_axi0_rd_limit_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<29) & word0) | ((((1U << 1) - 1) & value) << 29); + return *this; + } + CONSTEXPR uint32_t get_axi0_aw_stalled() const + { + auto v = ((1U << 1) - 1) & (word0 >> 30); + return v; + } + CONSTEXPR dma_status0_r& set_axi0_aw_stalled(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<30) & word0) | ((((1U << 1) - 1) & value) << 30); + return *this; + } + CONSTEXPR uint32_t get_axi0_w_stalled() const + { + auto v = ((1U << 1) - 1) & (word0 
>> 31); + return v; + } + CONSTEXPR dma_status0_r& set_axi0_w_stalled(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<31) & word0) | ((((1U << 1) - 1) & value) << 31); + return *this; + } +#endif +}; + + +struct dma_status1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t axi0_wr_limit_stall : 1; + uint32_t axi1_ar_stalled : 1; + uint32_t axi1_rd_limit_stall : 1; + uint32_t axi1_wr_stalled : 1; + uint32_t axi1_w_stalled : 1; + uint32_t axi1_wr_limit_stall : 1; + uint32_t wgt_idle_c1 : 1; + uint32_t bas_idle_c1 : 1; + uint32_t ib0_ai_valid_c1 : 1; + uint32_t ib0_ai_ready_c1 : 1; + uint32_t ib1_ai_valid_c1 : 1; + uint32_t ib1_ai_ready_c1 : 1; + uint32_t ib0_ao_valid_c1 : 1; + uint32_t ib0_ao_ready_c1 : 1; + uint32_t ib1_ao_valid_c1 : 1; + uint32_t ib1_ao_ready_c1 : 1; + uint32_t ob0_valid_c1 : 1; + uint32_t ob0_ready_c1 : 1; + uint32_t ob1_valid_c1 : 1; + uint32_t ob1_ready_c1 : 1; + uint32_t wd_bitstream_valid_c1 : 1; + uint32_t wd_bitstream_ready_c1 : 1; + uint32_t bs_bitstream_valid_c1 : 1; + uint32_t bs_bitstream_ready_c1 : 1; + uint32_t reserved0 : 8; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR dma_status1_r() : + word0(0) + {} + CONSTEXPR dma_status1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + dma_status1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_axi0_wr_limit_stall() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR dma_status1_r& set_axi0_wr_limit_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_axi1_ar_stalled() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + return v; + } + CONSTEXPR dma_status1_r& set_axi1_ar_stalled(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & value) << 1); + return *this; + } + 
CONSTEXPR uint32_t get_axi1_rd_limit_stall() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + CONSTEXPR dma_status1_r& set_axi1_rd_limit_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) & value) << 2); + return *this; + } + CONSTEXPR uint32_t get_axi1_wr_stalled() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR dma_status1_r& set_axi1_wr_stalled(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + return *this; + } + CONSTEXPR uint32_t get_axi1_w_stalled() const + { + auto v = ((1U << 1) - 1) & (word0 >> 4); + return v; + } + CONSTEXPR dma_status1_r& set_axi1_w_stalled(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<4) & word0) | ((((1U << 1) - 1) & value) << 4); + return *this; + } + CONSTEXPR uint32_t get_axi1_wr_limit_stall() const + { + auto v = ((1U << 1) - 1) & (word0 >> 5); + return v; + } + CONSTEXPR dma_status1_r& set_axi1_wr_limit_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<5) & word0) | ((((1U << 1) - 1) & value) << 5); + return *this; + } + CONSTEXPR uint32_t get_wgt_idle_c1() const + { + auto v = ((1U << 1) - 1) & (word0 >> 6); + return v; + } + CONSTEXPR dma_status1_r& set_wgt_idle_c1(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<6) & word0) | ((((1U << 1) - 1) & value) << 6); + return *this; + } + CONSTEXPR uint32_t get_bas_idle_c1() const + { + auto v = ((1U << 1) - 1) & (word0 >> 7); + return v; + } + CONSTEXPR dma_status1_r& set_bas_idle_c1(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<7) & word0) | ((((1U << 1) - 1) & value) << 7); + return *this; + } + CONSTEXPR uint32_t get_ib0_ai_valid_c1() const + { + auto v = ((1U << 1) - 1) & (word0 >> 8); + return v; + } + CONSTEXPR dma_status1_r& set_ib0_ai_valid_c1(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<8) & word0) | ((((1U << 1) - 1) & value) << 8); + return *this; + } + CONSTEXPR uint32_t get_ib0_ai_ready_c1() const + { + auto v = 
((1U << 1) - 1) & (word0 >> 9); + return v; + } + CONSTEXPR dma_status1_r& set_ib0_ai_ready_c1(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<9) & word0) | ((((1U << 1) - 1) & value) << 9); + return *this; + } + CONSTEXPR uint32_t get_ib1_ai_valid_c1() const + { + auto v = ((1U << 1) - 1) & (word0 >> 10); + return v; + } + CONSTEXPR dma_status1_r& set_ib1_ai_valid_c1(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<10) & word0) | ((((1U << 1) - 1) & value) << 10); + return *this; + } + CONSTEXPR uint32_t get_ib1_ai_ready_c1() const + { + auto v = ((1U << 1) - 1) & (word0 >> 11); + return v; + } + CONSTEXPR dma_status1_r& set_ib1_ai_ready_c1(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<11) & word0) | ((((1U << 1) - 1) & value) << 11); + return *this; + } + CONSTEXPR uint32_t get_ib0_ao_valid_c1() const + { + auto v = ((1U << 1) - 1) & (word0 >> 12); + return v; + } + CONSTEXPR dma_status1_r& set_ib0_ao_valid_c1(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<12) & word0) | ((((1U << 1) - 1) & value) << 12); + return *this; + } + CONSTEXPR uint32_t get_ib0_ao_ready_c1() const + { + auto v = ((1U << 1) - 1) & (word0 >> 13); + return v; + } + CONSTEXPR dma_status1_r& set_ib0_ao_ready_c1(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<13) & word0) | ((((1U << 1) - 1) & value) << 13); + return *this; + } + CONSTEXPR uint32_t get_ib1_ao_valid_c1() const + { + auto v = ((1U << 1) - 1) & (word0 >> 14); + return v; + } + CONSTEXPR dma_status1_r& set_ib1_ao_valid_c1(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<14) & word0) | ((((1U << 1) - 1) & value) << 14); + return *this; + } + CONSTEXPR uint32_t get_ib1_ao_ready_c1() const + { + auto v = ((1U << 1) - 1) & (word0 >> 15); + return v; + } + CONSTEXPR dma_status1_r& set_ib1_ao_ready_c1(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<15) & word0) | ((((1U << 1) - 1) & value) << 15); + return *this; + } + CONSTEXPR uint32_t get_ob0_valid_c1() const + { + auto v = ((1U << 1) - 1) & (word0 >> 16); + return v; 
+ } + CONSTEXPR dma_status1_r& set_ob0_valid_c1(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<16) & word0) | ((((1U << 1) - 1) & value) << 16); + return *this; + } + CONSTEXPR uint32_t get_ob0_ready_c1() const + { + auto v = ((1U << 1) - 1) & (word0 >> 17); + return v; + } + CONSTEXPR dma_status1_r& set_ob0_ready_c1(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<17) & word0) | ((((1U << 1) - 1) & value) << 17); + return *this; + } + CONSTEXPR uint32_t get_ob1_valid_c1() const + { + auto v = ((1U << 1) - 1) & (word0 >> 18); + return v; + } + CONSTEXPR dma_status1_r& set_ob1_valid_c1(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<18) & word0) | ((((1U << 1) - 1) & value) << 18); + return *this; + } + CONSTEXPR uint32_t get_ob1_ready_c1() const + { + auto v = ((1U << 1) - 1) & (word0 >> 19); + return v; + } + CONSTEXPR dma_status1_r& set_ob1_ready_c1(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<19) & word0) | ((((1U << 1) - 1) & value) << 19); + return *this; + } + CONSTEXPR uint32_t get_wd_bitstream_valid_c1() const + { + auto v = ((1U << 1) - 1) & (word0 >> 20); + return v; + } + CONSTEXPR dma_status1_r& set_wd_bitstream_valid_c1(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<20) & word0) | ((((1U << 1) - 1) & value) << 20); + return *this; + } + CONSTEXPR uint32_t get_wd_bitstream_ready_c1() const + { + auto v = ((1U << 1) - 1) & (word0 >> 21); + return v; + } + CONSTEXPR dma_status1_r& set_wd_bitstream_ready_c1(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<21) & word0) | ((((1U << 1) - 1) & value) << 21); + return *this; + } + CONSTEXPR uint32_t get_bs_bitstream_valid_c1() const + { + auto v = ((1U << 1) - 1) & (word0 >> 22); + return v; + } + CONSTEXPR dma_status1_r& set_bs_bitstream_valid_c1(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<22) & word0) | ((((1U << 1) - 1) & value) << 22); + return *this; + } + CONSTEXPR uint32_t get_bs_bitstream_ready_c1() const + { + auto v = ((1U << 1) - 1) & (word0 >> 23); + return v; + } + CONSTEXPR 
dma_status1_r& set_bs_bitstream_ready_c1(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<23) & word0) | ((((1U << 1) - 1) & value) << 23); + return *this; + } +#endif +}; + + +struct clkforce_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t top_level_clk : 1; + uint32_t cc_clk : 1; + uint32_t dma_clk : 1; + uint32_t mac_clk : 1; + uint32_t ao_clk : 1; + uint32_t wd_clk : 1; + uint32_t reserved0 : 26; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR clkforce_r() : + word0(0) + {} + CONSTEXPR clkforce_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + clkforce_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_top_level_clk() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR clkforce_r& set_top_level_clk(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_cc_clk() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + return v; + } + CONSTEXPR clkforce_r& set_cc_clk(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & value) << 1); + return *this; + } + CONSTEXPR uint32_t get_dma_clk() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + CONSTEXPR clkforce_r& set_dma_clk(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) & value) << 2); + return *this; + } + CONSTEXPR uint32_t get_mac_clk() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR clkforce_r& set_mac_clk(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + return *this; + } + CONSTEXPR uint32_t get_ao_clk() const + { + auto v = ((1U << 1) - 1) & (word0 >> 4); + return v; + } + CONSTEXPR clkforce_r& set_ao_clk(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<4) & 
word0) | ((((1U << 1) - 1) & value) << 4); + return *this; + } + CONSTEXPR uint32_t get_wd_clk() const + { + auto v = ((1U << 1) - 1) & (word0 >> 5); + return v; + } + CONSTEXPR clkforce_r& set_wd_clk(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<5) & word0) | ((((1U << 1) - 1) & value) << 5); + return *this; + } +#endif +}; + + +struct debug_address_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t addr : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR debug_address_r() : + word0(0) + {} + CONSTEXPR debug_address_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + debug_address_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_addr() const + { + auto v = word0; + return v; + } + CONSTEXPR debug_address_r& set_addr(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct debug_misc_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t misc : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR debug_misc_r() : + word0(0) + {} + CONSTEXPR debug_misc_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + debug_misc_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_misc() const + { + auto v = word0; + return v; + } + CONSTEXPR debug_misc_r& set_misc(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct debugcore_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t core : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR debugcore_r() : + word0(0) + {} + CONSTEXPR debugcore_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + debugcore_r 
copy() + { + return *this; + } + CONSTEXPR uint32_t get_core() const + { + auto v = word0; + return v; + } + CONSTEXPR debugcore_r& set_core(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct debug_block_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t block : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR debug_block_r() : + word0(0) + {} + CONSTEXPR debug_block_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + debug_block_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_block() const + { + auto v = word0; + return v; + } + CONSTEXPR debug_block_r& set_block(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct pmcr_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t cnt_en : 1; + uint32_t event_cnt_rst : 1; + uint32_t cycle_cnt_rst : 1; + uint32_t mask_en : 1; + uint32_t reserved0 : 7; + uint32_t num_event_cnt : 5; + uint32_t reserved1 : 16; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pmcr_r() : + word0(8192) + {} + CONSTEXPR pmcr_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pmcr_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_cnt_en() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR pmcr_r& set_cnt_en(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_event_cnt_rst() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + return v; + } + CONSTEXPR pmcr_r& set_event_cnt_rst(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & value) << 1); + return *this; + } + CONSTEXPR uint32_t get_cycle_cnt_rst() const + { 
+ auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + CONSTEXPR pmcr_r& set_cycle_cnt_rst(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) & value) << 2); + return *this; + } + CONSTEXPR uint32_t get_mask_en() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR pmcr_r& set_mask_en(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + return *this; + } + CONSTEXPR uint32_t get_num_event_cnt() const + { + auto v = ((1U << 5) - 1) & (word0 >> 11); + return v; + } + CONSTEXPR pmcr_r& set_num_event_cnt(uint32_t value) + { + word0 = (~(((1U << 5) - 1)<<11) & word0) | ((((1U << 5) - 1) & value) << 11); + return *this; + } +#endif +}; + + +struct pmcntenset_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t EVENT_CNT_0 : 1; + uint32_t EVENT_CNT_1 : 1; + uint32_t EVENT_CNT_2 : 1; + uint32_t EVENT_CNT_3 : 1; + uint32_t reserved0 : 27; + uint32_t CYCLE_CNT : 1; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pmcntenset_r() : + word0(0) + {} + CONSTEXPR pmcntenset_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pmcntenset_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR pmcntenset_r& set_EVENT_CNT_0(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_1() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + return v; + } + CONSTEXPR pmcntenset_r& set_EVENT_CNT_1(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & value) << 1); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_2() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + CONSTEXPR 
pmcntenset_r& set_EVENT_CNT_2(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) & value) << 2); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_3() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR pmcntenset_r& set_EVENT_CNT_3(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + return *this; + } + CONSTEXPR uint32_t get_CYCLE_CNT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 31); + return v; + } + CONSTEXPR pmcntenset_r& set_CYCLE_CNT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<31) & word0) | ((((1U << 1) - 1) & value) << 31); + return *this; + } +#endif +}; + + +struct pmcntenclr_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t EVENT_CNT_0 : 1; + uint32_t EVENT_CNT_1 : 1; + uint32_t EVENT_CNT_2 : 1; + uint32_t EVENT_CNT_3 : 1; + uint32_t reserved0 : 27; + uint32_t CYCLE_CNT : 1; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pmcntenclr_r() : + word0(0) + {} + CONSTEXPR pmcntenclr_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pmcntenclr_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR pmcntenclr_r& set_EVENT_CNT_0(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_1() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + return v; + } + CONSTEXPR pmcntenclr_r& set_EVENT_CNT_1(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & value) << 1); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_2() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + CONSTEXPR pmcntenclr_r& set_EVENT_CNT_2(uint32_t value) + { + word0 = 
(~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) & value) << 2); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_3() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR pmcntenclr_r& set_EVENT_CNT_3(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + return *this; + } + CONSTEXPR uint32_t get_CYCLE_CNT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 31); + return v; + } + CONSTEXPR pmcntenclr_r& set_CYCLE_CNT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<31) & word0) | ((((1U << 1) - 1) & value) << 31); + return *this; + } +#endif +}; + + +struct pmovsset_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t EVENT_CNT_0_OVF : 1; + uint32_t EVENT_CNT_1_OVF : 1; + uint32_t EVENT_CNT_2_OVF : 1; + uint32_t EVENT_CNT_3_OVF : 1; + uint32_t reserved0 : 27; + uint32_t CYCLE_CNT_OVF : 1; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pmovsset_r() : + word0(0) + {} + CONSTEXPR pmovsset_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pmovsset_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_0_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR pmovsset_r& set_EVENT_CNT_0_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_1_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + return v; + } + CONSTEXPR pmovsset_r& set_EVENT_CNT_1_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & value) << 1); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_2_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + CONSTEXPR pmovsset_r& set_EVENT_CNT_2_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) 
| ((((1U << 1) - 1) & value) << 2); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_3_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR pmovsset_r& set_EVENT_CNT_3_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + return *this; + } + CONSTEXPR uint32_t get_CYCLE_CNT_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 31); + return v; + } + CONSTEXPR pmovsset_r& set_CYCLE_CNT_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<31) & word0) | ((((1U << 1) - 1) & value) << 31); + return *this; + } +#endif +}; + + +struct pmovsclr_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t EVENT_CNT_0_OVF : 1; + uint32_t EVENT_CNT_1_OVF : 1; + uint32_t EVENT_CNT_2_OVF : 1; + uint32_t EVENT_CNT_3_OVF : 1; + uint32_t reserved0 : 27; + uint32_t CYCLE_CNT_OVF : 1; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pmovsclr_r() : + word0(0) + {} + CONSTEXPR pmovsclr_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pmovsclr_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_0_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR pmovsclr_r& set_EVENT_CNT_0_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_1_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + return v; + } + CONSTEXPR pmovsclr_r& set_EVENT_CNT_1_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & value) << 1); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_2_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + CONSTEXPR pmovsclr_r& set_EVENT_CNT_2_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) 
& value) << 2); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_3_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR pmovsclr_r& set_EVENT_CNT_3_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + return *this; + } + CONSTEXPR uint32_t get_CYCLE_CNT_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 31); + return v; + } + CONSTEXPR pmovsclr_r& set_CYCLE_CNT_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<31) & word0) | ((((1U << 1) - 1) & value) << 31); + return *this; + } +#endif +}; + + +struct pmintset_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t EVENT_CNT_0_INT : 1; + uint32_t EVENT_CNT_1_INT : 1; + uint32_t EVENT_CNT_2_INT : 1; + uint32_t EVENT_CNT_3_INT : 1; + uint32_t reserved0 : 27; + uint32_t CYCLE_CNT_INT : 1; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pmintset_r() : + word0(0) + {} + CONSTEXPR pmintset_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pmintset_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_0_INT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR pmintset_r& set_EVENT_CNT_0_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_1_INT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + return v; + } + CONSTEXPR pmintset_r& set_EVENT_CNT_1_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & value) << 1); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_2_INT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + CONSTEXPR pmintset_r& set_EVENT_CNT_2_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) & value) << 2); + 
return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_3_INT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR pmintset_r& set_EVENT_CNT_3_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + return *this; + } + CONSTEXPR uint32_t get_CYCLE_CNT_INT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 31); + return v; + } + CONSTEXPR pmintset_r& set_CYCLE_CNT_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<31) & word0) | ((((1U << 1) - 1) & value) << 31); + return *this; + } +#endif +}; + + +struct pmintclr_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t EVENT_CNT_0_INT : 1; + uint32_t EVENT_CNT_1_INT : 1; + uint32_t EVENT_CNT_2_INT : 1; + uint32_t EVENT_CNT_3_INT : 1; + uint32_t reserved0 : 27; + uint32_t CYCLE_CNT_INT : 1; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pmintclr_r() : + word0(0) + {} + CONSTEXPR pmintclr_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pmintclr_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_0_INT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR pmintclr_r& set_EVENT_CNT_0_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_1_INT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + return v; + } + CONSTEXPR pmintclr_r& set_EVENT_CNT_1_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & value) << 1); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_2_INT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + CONSTEXPR pmintclr_r& set_EVENT_CNT_2_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) & value) << 2); + return *this; + } + 
CONSTEXPR uint32_t get_EVENT_CNT_3_INT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR pmintclr_r& set_EVENT_CNT_3_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + return *this; + } + CONSTEXPR uint32_t get_CYCLE_CNT_INT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 31); + return v; + } + CONSTEXPR pmintclr_r& set_CYCLE_CNT_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<31) & word0) | ((((1U << 1) - 1) & value) << 31); + return *this; + } +#endif +}; + + +struct pmccntr_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t CYCLE_CNT_LO : 32; + uint32_t CYCLE_CNT_HI : 16; + uint32_t reserved0 : 16; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR pmccntr_r() : + word0(0), + word1(0) + {} + CONSTEXPR pmccntr_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + pmccntr_r copy() + { + return *this; + } +#endif +}; + + +struct pmccntr_cfg_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t CYCLE_CNT_CFG_START : 10; + uint32_t reserved0 : 6; + uint32_t CYCLE_CNT_CFG_STOP : 10; + uint32_t reserved1 : 6; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pmccntr_cfg_r() : + word0(0) + {} + CONSTEXPR pmccntr_cfg_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pmccntr_cfg_r copy() + { + return *this; + } + CONSTEXPR NPU_NAMESPACE::pmu_event 
get_CYCLE_CNT_CFG_START() const + { + auto v = ((1U << 10) - 1) & (word0 >> 0); + assert(v <= 433); + return static_cast(v); + } + CONSTEXPR pmccntr_cfg_r& set_CYCLE_CNT_CFG_START(NPU_NAMESPACE::pmu_event value) + { + word0 = (~(((1U << 10) - 1)<<0) & word0) | ((((1U << 10) - 1) & static_cast(value)) << 0); + return *this; + } + CONSTEXPR NPU_NAMESPACE::pmu_event get_CYCLE_CNT_CFG_STOP() const + { + auto v = ((1U << 10) - 1) & (word0 >> 16); + assert(v <= 433); + return static_cast(v); + } + CONSTEXPR pmccntr_cfg_r& set_CYCLE_CNT_CFG_STOP(NPU_NAMESPACE::pmu_event value) + { + word0 = (~(((1U << 10) - 1)<<16) & word0) | ((((1U << 10) - 1) & static_cast(value)) << 16); + return *this; + } +#endif +}; + + +struct pmcaxi_chan_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t CH_SEL : 4; + uint32_t reserved0 : 4; + uint32_t AXI_CNT_SEL : 2; + uint32_t BW_CH_SEL_EN : 1; + uint32_t reserved1 : 21; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pmcaxi_chan_r() : + word0(0) + {} + CONSTEXPR pmcaxi_chan_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pmcaxi_chan_r copy() + { + return *this; + } + CONSTEXPR NPU_NAMESPACE::pmu_axi_channel get_CH_SEL() const + { + auto v = ((1U << 4) - 1) & (word0 >> 0); + assert(v <= 9); + return static_cast(v); + } + CONSTEXPR pmcaxi_chan_r& set_CH_SEL(NPU_NAMESPACE::pmu_axi_channel value) + { + word0 = (~(((1U << 4) - 1)<<0) & word0) | ((((1U << 4) - 1) & static_cast(value)) << 0); + return *this; + } + CONSTEXPR uint32_t get_AXI_CNT_SEL() const + { + auto v = ((1U << 2) - 1) & (word0 >> 8); + return v; + } + CONSTEXPR pmcaxi_chan_r& set_AXI_CNT_SEL(uint32_t value) + { + word0 = (~(((1U << 2) - 1)<<8) & word0) | ((((1U << 2) - 1) & value) << 8); + return *this; + } + CONSTEXPR uint32_t get_BW_CH_SEL_EN() const + { + auto v = ((1U << 1) - 1) & (word0 >> 10); + return v; + } + 
CONSTEXPR pmcaxi_chan_r& set_BW_CH_SEL_EN(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<10) & word0) | ((((1U << 1) - 1) & value) << 10); + return *this; + } +#endif +}; + + +struct kernel_x_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR kernel_x_r() : + word0(0) + {} + CONSTEXPR kernel_x_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + kernel_x_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR kernel_x_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct kernel_y_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR kernel_y_r() : + word0(0) + {} + CONSTEXPR kernel_y_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + kernel_y_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR kernel_y_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct kernel_w_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR kernel_w_m1_r() : + word0(0) + {} + CONSTEXPR kernel_w_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + kernel_w_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR kernel_w_m1_r& set_value(uint32_t value) + { + word0 = 
value; + return *this; + } +#endif +}; + + +struct kernel_h_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR kernel_h_m1_r() : + word0(0) + {} + CONSTEXPR kernel_h_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + kernel_h_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR kernel_h_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_cblk_width_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_cblk_width_m1_r() : + word0(0) + {} + CONSTEXPR ofm_cblk_width_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_cblk_width_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_cblk_width_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_cblk_height_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_cblk_height_m1_r() : + word0(0) + {} + CONSTEXPR ofm_cblk_height_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_cblk_height_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_cblk_height_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + 
+ +struct ofm_cblk_depth_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_cblk_depth_m1_r() : + word0(0) + {} + CONSTEXPR ofm_cblk_depth_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_cblk_depth_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_cblk_depth_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_cblk_depth_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_cblk_depth_m1_r() : + word0(0) + {} + CONSTEXPR ifm_cblk_depth_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_cblk_depth_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_cblk_depth_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_x_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_x_r() : + word0(0) + {} + CONSTEXPR ofm_x_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_x_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_x_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_y_r +{ +#ifndef __cplusplus + union + { + struct + { + 
uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_y_r() : + word0(0) + {} + CONSTEXPR ofm_y_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_y_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_y_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_z_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_z_r() : + word0(0) + {} + CONSTEXPR ofm_z_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_z_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_z_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_z_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_z_r() : + word0(0) + {} + CONSTEXPR ifm_z_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_z_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_z_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct pad_top_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pad_top_r() : + word0(0) + {} + CONSTEXPR pad_top_r(uint32_t init) : + word0(init) + {} + 
CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pad_top_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR pad_top_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct pad_left_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pad_left_r() : + word0(0) + {} + CONSTEXPR pad_left_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pad_left_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR pad_left_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_cblk_width_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_cblk_width_r() : + word0(0) + {} + CONSTEXPR ifm_cblk_width_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_cblk_width_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_cblk_width_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_cblk_height_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_cblk_height_r() : + word0(0) + {} + CONSTEXPR ifm_cblk_height_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { 
+ return word0; + } + ifm_cblk_height_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_cblk_height_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct dma_ifm_src_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t offset_LO : 32; + uint32_t offset_HI : 8; + uint32_t reserved0 : 24; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma_ifm_src_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma_ifm_src_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma_ifm_src_r copy() + { + return *this; + } +#endif +}; + + +struct dma_ifm_dst_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR dma_ifm_dst_r() : + word0(0) + {} + CONSTEXPR dma_ifm_dst_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + dma_ifm_dst_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR dma_ifm_dst_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct dma_ofm_src_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR dma_ofm_src_r() : + word0(0) + {} + CONSTEXPR dma_ofm_src_r(uint32_t init) : + 
word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + dma_ofm_src_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR dma_ofm_src_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct dma_ofm_dst_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t offset_LO : 32; + uint32_t offset_HI : 8; + uint32_t reserved0 : 24; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma_ofm_dst_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma_ofm_dst_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma_ofm_dst_r copy() + { + return *this; + } +#endif +}; + + +struct dma_weight_src_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t offset_LO : 32; + uint32_t offset_HI : 8; + uint32_t reserved0 : 24; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma_weight_src_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma_weight_src_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) 
<< 32) | word0; + } + dma_weight_src_r copy() + { + return *this; + } +#endif +}; + + +struct dma_cmd_src_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t offset_LO : 32; + uint32_t offset_HI : 8; + uint32_t reserved0 : 24; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma_cmd_src_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma_cmd_src_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma_cmd_src_r copy() + { + return *this; + } +#endif +}; + + +struct dma_cmd_size_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR dma_cmd_size_r() : + word0(0) + {} + CONSTEXPR dma_cmd_size_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + dma_cmd_size_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR dma_cmd_size_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct dma_m2m_src_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t offset_LO : 32; + uint32_t offset_HI : 8; + uint32_t reserved0 : 24; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma_m2m_src_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma_m2m_src_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + 
word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma_m2m_src_r copy() + { + return *this; + } +#endif +}; + + +struct dma_m2m_dst_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t offset_LO : 32; + uint32_t offset_HI : 8; + uint32_t reserved0 : 24; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma_m2m_dst_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma_m2m_dst_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma_m2m_dst_r copy() + { + return *this; + } +#endif +}; + + +struct current_qread_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR current_qread_r() : + word0(0) + {} + CONSTEXPR current_qread_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + current_qread_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR current_qread_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct dma_scale_src_r +{ +#ifndef __cplusplus + union + { + struct + { + 
uint32_t offset_LO : 32; + uint32_t offset_HI : 8; + uint32_t reserved0 : 24; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma_scale_src_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma_scale_src_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma_scale_src_r copy() + { + return *this; + } +#endif +}; + + +struct current_block_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR current_block_r() : + word0(0) + {} + CONSTEXPR current_block_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + current_block_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR current_block_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct current_op_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR current_op_r() : + word0(0) + {} + CONSTEXPR current_op_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + current_op_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR current_op_r& set_value(uint32_t value) + { + 
word0 = value; + return *this; + } +#endif +}; + + +struct current_cmd_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR current_cmd_r() : + word0(0) + {} + CONSTEXPR current_cmd_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + current_cmd_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR current_cmd_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct pmevcntr_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t count : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pmevcntr_r() : + word0(0) + {} + CONSTEXPR pmevcntr_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pmevcntr_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_count() const + { + auto v = word0; + return v; + } + CONSTEXPR pmevcntr_r& set_count(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct pmevtyper_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t EV_TYPE : 10; + uint32_t reserved0 : 22; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pmevtyper_r() : + word0(0) + {} + CONSTEXPR pmevtyper_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pmevtyper_r copy() + { + return *this; + } + CONSTEXPR NPU_NAMESPACE::pmu_event get_EV_TYPE() const + { + auto v = ((1U << 10) - 1) & (word0 >> 0); + assert(v <= 433); + return static_cast(v); + } + CONSTEXPR pmevtyper_r& set_EV_TYPE(NPU_NAMESPACE::pmu_event value) + { + word0 = 
(~(((1U << 10) - 1)<<0) & word0) | ((((1U << 10) - 1) & static_cast(value)) << 0); + return *this; + } +#endif +}; + + +struct shared_buffer_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t mem_word : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR shared_buffer_r() : + word0(0) + {} + CONSTEXPR shared_buffer_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + shared_buffer_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_mem_word() const + { + auto v = word0; + return v; + } + CONSTEXPR shared_buffer_r& set_mem_word(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_pad_top_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_pad_top_r() : + word0(0) + {} + CONSTEXPR ifm_pad_top_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_pad_top_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_pad_top_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_pad_left_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_pad_left_r() : + word0(0) + {} + CONSTEXPR ifm_pad_left_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_pad_left_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_pad_left_r& set_value(uint32_t value) + { + word0 = value; 
+ return *this; + } +#endif +}; + + +struct ifm_pad_right_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_pad_right_r() : + word0(0) + {} + CONSTEXPR ifm_pad_right_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_pad_right_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_pad_right_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_pad_bottom_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_pad_bottom_r() : + word0(0) + {} + CONSTEXPR ifm_pad_bottom_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_pad_bottom_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_pad_bottom_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_depth_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_depth_m1_r() : + word0(0) + {} + CONSTEXPR ifm_depth_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_depth_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_depth_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_precision_r +{ +#ifndef 
__cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_precision_r() : + word0(0) + {} + CONSTEXPR ifm_precision_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_precision_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_precision_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_upscale_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_upscale_r() : + word0(0) + {} + CONSTEXPR ifm_upscale_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_upscale_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_upscale_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_zero_point_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_zero_point_r() : + word0(0) + {} + CONSTEXPR ifm_zero_point_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_zero_point_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_zero_point_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_width0_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; 
+ }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_width0_m1_r() : + word0(0) + {} + CONSTEXPR ifm_width0_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_width0_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_width0_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_height0_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_height0_m1_r() : + word0(0) + {} + CONSTEXPR ifm_height0_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_height0_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_height0_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_height1_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_height1_m1_r() : + word0(0) + {} + CONSTEXPR ifm_height1_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_height1_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_height1_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_ib_end_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR 
ifm_ib_end_r() : + word0(0) + {} + CONSTEXPR ifm_ib_end_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_ib_end_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_ib_end_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_region_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_region_r() : + word0(0) + {} + CONSTEXPR ifm_region_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_region_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_region_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_width_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_width_m1_r() : + word0(0) + {} + CONSTEXPR ofm_width_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_width_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_width_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_height_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_height_m1_r() : + word0(0) + {} + CONSTEXPR ofm_height_m1_r(uint32_t init) : + word0(init) + {} + 
CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_height_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_height_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_depth_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_depth_m1_r() : + word0(0) + {} + CONSTEXPR ofm_depth_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_depth_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_depth_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_precision_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_precision_r() : + word0(0) + {} + CONSTEXPR ofm_precision_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_precision_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_precision_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_blk_width_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_blk_width_m1_r() : + word0(0) + {} + CONSTEXPR ofm_blk_width_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + 
CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_blk_width_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_blk_width_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_blk_height_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_blk_height_m1_r() : + word0(0) + {} + CONSTEXPR ofm_blk_height_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_blk_height_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_blk_height_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_blk_depth_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_blk_depth_m1_r() : + word0(0) + {} + CONSTEXPR ofm_blk_depth_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_blk_depth_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_blk_depth_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_zero_point_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_zero_point_r() : + word0(0) + {} + CONSTEXPR ofm_zero_point_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + 
{ + return word0; + } + ofm_zero_point_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_zero_point_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_width0_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_width0_m1_r() : + word0(0) + {} + CONSTEXPR ofm_width0_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_width0_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_width0_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_height0_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_height0_m1_r() : + word0(0) + {} + CONSTEXPR ofm_height0_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_height0_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_height0_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_height1_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_height1_m1_r() : + word0(0) + {} + CONSTEXPR ofm_height1_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_height1_m1_r copy() + { + return 
*this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_height1_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_region_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_region_r() : + word0(0) + {} + CONSTEXPR ofm_region_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_region_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_region_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct kernel_width_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR kernel_width_m1_r() : + word0(0) + {} + CONSTEXPR kernel_width_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + kernel_width_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR kernel_width_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct kernel_height_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR kernel_height_m1_r() : + word0(0) + {} + CONSTEXPR kernel_height_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + kernel_height_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = 
word0; + return v; + } + CONSTEXPR kernel_height_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct kernel_stride_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR kernel_stride_r() : + word0(0) + {} + CONSTEXPR kernel_stride_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + kernel_stride_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR kernel_stride_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct parallel_mode_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR parallel_mode_r() : + word0(0) + {} + CONSTEXPR parallel_mode_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + parallel_mode_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR parallel_mode_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct acc_format_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR acc_format_r() : + word0(0) + {} + CONSTEXPR acc_format_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + acc_format_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR acc_format_r& set_value(uint32_t value) + { + 
word0 = value; + return *this; + } +#endif +}; + + +struct activation_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR activation_r() : + word0(0) + {} + CONSTEXPR activation_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + activation_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR activation_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct activation_min_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR activation_min_r() : + word0(0) + {} + CONSTEXPR activation_min_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + activation_min_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR activation_min_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct activation_max_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR activation_max_r() : + word0(0) + {} + CONSTEXPR activation_max_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + activation_max_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR activation_max_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct weight_region_r +{ 
+#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR weight_region_r() : + word0(0) + {} + CONSTEXPR weight_region_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + weight_region_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR weight_region_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct scale_region_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR scale_region_r() : + word0(0) + {} + CONSTEXPR scale_region_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + scale_region_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR scale_region_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ab_start_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ab_start_r() : + word0(0) + {} + CONSTEXPR ab_start_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ab_start_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ab_start_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct blockdep_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else 
+private: + uint32_t word0; +public: + CONSTEXPR blockdep_r() : + word0(0) + {} + CONSTEXPR blockdep_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + blockdep_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR blockdep_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct dma0_src_region_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR dma0_src_region_r() : + word0(0) + {} + CONSTEXPR dma0_src_region_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + dma0_src_region_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR dma0_src_region_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct dma0_dst_region_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR dma0_dst_region_r() : + word0(0) + {} + CONSTEXPR dma0_dst_region_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + dma0_dst_region_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR dma0_dst_region_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct dma0_size0_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR dma0_size0_r() : + word0(0) + 
{} + CONSTEXPR dma0_size0_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + dma0_size0_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR dma0_size0_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct dma0_size1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR dma0_size1_r() : + word0(0) + {} + CONSTEXPR dma0_size1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + dma0_size1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR dma0_size1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm2_broadcast_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm2_broadcast_r() : + word0(0) + {} + CONSTEXPR ifm2_broadcast_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm2_broadcast_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm2_broadcast_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm2_scalar_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm2_scalar_r() : + word0(0) + {} + CONSTEXPR ifm2_scalar_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void 
operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm2_scalar_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm2_scalar_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm2_precision_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm2_precision_r() : + word0(0) + {} + CONSTEXPR ifm2_precision_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm2_precision_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm2_precision_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm2_zero_point_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm2_zero_point_r() : + word0(0) + {} + CONSTEXPR ifm2_zero_point_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm2_zero_point_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm2_zero_point_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm2_width0_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm2_width0_m1_r() : + word0(0) + {} + CONSTEXPR ifm2_width0_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR 
operator uint32_t() + { + return word0; + } + ifm2_width0_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm2_width0_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm2_height0_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm2_height0_m1_r() : + word0(0) + {} + CONSTEXPR ifm2_height0_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm2_height0_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm2_height0_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm2_height1_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm2_height1_m1_r() : + word0(0) + {} + CONSTEXPR ifm2_height1_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm2_height1_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm2_height1_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm2_ib_start_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm2_ib_start_r() : + word0(0) + {} + CONSTEXPR ifm2_ib_start_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + 
ifm2_ib_start_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm2_ib_start_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm2_region_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm2_region_r() : + word0(0) + {} + CONSTEXPR ifm2_region_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm2_region_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm2_region_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_base0_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm_base0_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm_base0_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm_base0_r copy() + { + return *this; + } +#endif +}; + + +struct ifm_base1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm_base1_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm_base1_r(uint64_t init) : + 
word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm_base1_r copy() + { + return *this; + } +#endif +}; + + +struct ifm_base2_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm_base2_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm_base2_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm_base2_r copy() + { + return *this; + } +#endif +}; + + +struct ifm_base3_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm_base3_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm_base3_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR 
operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm_base3_r copy() + { + return *this; + } +#endif +}; + + +struct ifm_stride_x_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm_stride_x_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm_stride_x_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm_stride_x_r copy() + { + return *this; + } +#endif +}; + + +struct ifm_stride_y_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm_stride_y_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm_stride_y_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm_stride_y_r copy() + { + return *this; + } +#endif +}; + + +struct ifm_stride_c_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: 
+ CONSTEXPR ifm_stride_c_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm_stride_c_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm_stride_c_r copy() + { + return *this; + } +#endif +}; + + +struct ofm_base0_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ofm_base0_r() : + word0(0), + word1(0) + {} + CONSTEXPR ofm_base0_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ofm_base0_r copy() + { + return *this; + } +#endif +}; + + +struct ofm_base1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ofm_base1_r() : + word0(0), + word1(0) + {} + CONSTEXPR ofm_base1_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & 
static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ofm_base1_r copy() + { + return *this; + } +#endif +}; + + +struct ofm_base2_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ofm_base2_r() : + word0(0), + word1(0) + {} + CONSTEXPR ofm_base2_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ofm_base2_r copy() + { + return *this; + } +#endif +}; + + +struct ofm_base3_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ofm_base3_r() : + word0(0), + word1(0) + {} + CONSTEXPR ofm_base3_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ofm_base3_r copy() + { + return *this; + } +#endif +}; + + +struct ofm_stride_x_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + 
uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ofm_stride_x_r() : + word0(0), + word1(0) + {} + CONSTEXPR ofm_stride_x_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ofm_stride_x_r copy() + { + return *this; + } +#endif +}; + + +struct ofm_stride_y_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ofm_stride_y_r() : + word0(0), + word1(0) + {} + CONSTEXPR ofm_stride_y_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ofm_stride_y_r copy() + { + return *this; + } +#endif +}; + + +struct ofm_stride_c_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ofm_stride_c_r() : + word0(0), + word1(0) + {} + CONSTEXPR ofm_stride_c_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & 
static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ofm_stride_c_r copy() + { + return *this; + } +#endif +}; + + +struct weight_base_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR weight_base_r() : + word0(0), + word1(0) + {} + CONSTEXPR weight_base_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + weight_base_r copy() + { + return *this; + } +#endif +}; + + +struct weight_length_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR weight_length_r() : + word0(0), + word1(0) + {} + CONSTEXPR weight_length_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + 
weight_length_r copy() + { + return *this; + } +#endif +}; + + +struct scale_base_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR scale_base_r() : + word0(0), + word1(0) + {} + CONSTEXPR scale_base_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + scale_base_r copy() + { + return *this; + } +#endif +}; + + +struct scale_length_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR scale_length_r() : + word0(0), + word1(0) + {} + CONSTEXPR scale_length_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + scale_length_r copy() + { + return *this; + } +#endif +}; + + +struct ofm_scale_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ofm_scale_r() : + word0(0), + word1(0) + {} + CONSTEXPR 
ofm_scale_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ofm_scale_r copy() + { + return *this; + } +#endif +}; + + +struct opa_scale_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR opa_scale_r() : + word0(0), + word1(0) + {} + CONSTEXPR opa_scale_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + opa_scale_r copy() + { + return *this; + } +#endif +}; + + +struct opb_scale_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR opb_scale_r() : + word0(0) + {} + CONSTEXPR opb_scale_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + opb_scale_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR opb_scale_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct dma0_src_r +{ +#ifndef 
__cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma0_src_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma0_src_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma0_src_r copy() + { + return *this; + } +#endif +}; + + +struct dma0_dst_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma0_dst_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma0_dst_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma0_dst_r copy() + { + return *this; + } +#endif +}; + + +struct dma0_len_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma0_len_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma0_len_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) 
& static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma0_len_r copy() + { + return *this; + } +#endif +}; + + +struct dma0_skip0_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma0_skip0_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma0_skip0_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma0_skip0_r copy() + { + return *this; + } +#endif +}; + + +struct dma0_skip1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma0_skip1_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma0_skip1_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma0_skip1_r copy() + { + 
return *this; + } +#endif +}; + + +struct ifm2_base0_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm2_base0_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm2_base0_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm2_base0_r copy() + { + return *this; + } +#endif +}; + + +struct ifm2_base1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm2_base1_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm2_base1_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm2_base1_r copy() + { + return *this; + } +#endif +}; + + +struct ifm2_base2_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm2_base2_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm2_base2_r(uint64_t init) : + 
word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm2_base2_r copy() + { + return *this; + } +#endif +}; + + +struct ifm2_base3_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm2_base3_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm2_base3_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm2_base3_r copy() + { + return *this; + } +#endif +}; + + +struct ifm2_stride_x_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm2_stride_x_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm2_stride_x_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + 
CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm2_stride_x_r copy() + { + return *this; + } +#endif +}; + + +struct ifm2_stride_y_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm2_stride_y_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm2_stride_y_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm2_stride_y_r copy() + { + return *this; + } +#endif +}; + + +struct ifm2_stride_c_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm2_stride_c_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm2_stride_c_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm2_stride_c_r copy() + { + return *this; + } +#endif +}; + + +struct weight1_base_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + 
uint32_t word1; +public: + CONSTEXPR weight1_base_r() : + word0(0), + word1(0) + {} + CONSTEXPR weight1_base_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + weight1_base_r copy() + { + return *this; + } +#endif +}; + + +struct weight1_length_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR weight1_length_r() : + word0(0), + word1(0) + {} + CONSTEXPR weight1_length_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + weight1_length_r copy() + { + return *this; + } +#endif +}; + + +struct scale1_base_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR scale1_base_r() : + word0(0), + word1(0) + {} + CONSTEXPR scale1_base_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = 
static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + scale1_base_r copy() + { + return *this; + } +#endif +}; + + +struct scale1_length_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR scale1_length_r() : + word0(0), + word1(0) + {} + CONSTEXPR scale1_length_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + scale1_length_r copy() + { + return *this; + } +#endif +}; + + +struct revision_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR revision_r() : + word0(0) + {} + CONSTEXPR revision_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + revision_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR revision_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct pid4_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t PID4 : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pid4_r() : + word0(4) + {} + CONSTEXPR pid4_r(uint32_t init) : + word0(init) + {} + 
CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pid4_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_PID4() const + { + auto v = word0; + return v; + } + CONSTEXPR pid4_r& set_PID4(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct pid5_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t PID5 : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pid5_r() : + word0(0) + {} + CONSTEXPR pid5_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pid5_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_PID5() const + { + auto v = word0; + return v; + } + CONSTEXPR pid5_r& set_PID5(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct pid6_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t PID6 : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pid6_r() : + word0(0) + {} + CONSTEXPR pid6_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pid6_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_PID6() const + { + auto v = word0; + return v; + } + CONSTEXPR pid6_r& set_PID6(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct pid7_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t PID7 : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pid7_r() : + word0(0) + {} + CONSTEXPR pid7_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pid7_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_PID7() const + { + auto v = 
word0; + return v; + } + CONSTEXPR pid7_r& set_PID7(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct pid0_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t PID0 : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pid0_r() : + word0(129) + {} + CONSTEXPR pid0_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pid0_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_PID0() const + { + auto v = word0; + return v; + } + CONSTEXPR pid0_r& set_PID0(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct pid1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t PID1 : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pid1_r() : + word0(181) + {} + CONSTEXPR pid1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pid1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_PID1() const + { + auto v = word0; + return v; + } + CONSTEXPR pid1_r& set_PID1(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct pid2_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t PID2 : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pid2_r() : + word0(11) + {} + CONSTEXPR pid2_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pid2_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_PID2() const + { + auto v = word0; + return v; + } + CONSTEXPR pid2_r& set_PID2(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct pid3_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t PID3 : 32; + }; 
+ uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pid3_r() : + word0(0) + {} + CONSTEXPR pid3_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pid3_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_PID3() const + { + auto v = word0; + return v; + } + CONSTEXPR pid3_r& set_PID3(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct cid0_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t CID0 : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR cid0_r() : + word0(13) + {} + CONSTEXPR cid0_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + cid0_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_CID0() const + { + auto v = word0; + return v; + } + CONSTEXPR cid0_r& set_CID0(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct cid1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t CID1 : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR cid1_r() : + word0(240) + {} + CONSTEXPR cid1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + cid1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_CID1() const + { + auto v = word0; + return v; + } + CONSTEXPR cid1_r& set_CID1(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct cid2_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t CID2 : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR cid2_r() : + word0(5) + {} + CONSTEXPR cid2_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = 
value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + cid2_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_CID2() const + { + auto v = word0; + return v; + } + CONSTEXPR cid2_r& set_CID2(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct cid3_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t CID3 : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR cid3_r() : + word0(177) + {} + CONSTEXPR cid3_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + cid3_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_CID3() const + { + auto v = word0; + return v; + } + CONSTEXPR cid3_r& set_CID3(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + +struct NPU_REG +{ + STRUCT id_r ID; + STRUCT status_r STATUS; + STRUCT cmd_r CMD; + STRUCT reset_r RESET; + STRUCT qbase_r QBASE; + STRUCT qread_r QREAD; + STRUCT qconfig_r QCONFIG; + STRUCT qsize_r QSIZE; + STRUCT prot_r PROT; + STRUCT config_r CONFIG; + STRUCT lock_r LOCK; + uint32_t unused0[3]; + STRUCT regioncfg_r REGIONCFG; + STRUCT axi_limit0_r AXI_LIMIT0; + STRUCT axi_limit1_r AXI_LIMIT1; + STRUCT axi_limit2_r AXI_LIMIT2; + STRUCT axi_limit3_r AXI_LIMIT3; + uint32_t unused1[12]; + STRUCT basep_r BASEP[8]; + uint32_t unused2[16]; + STRUCT wd_status_r WD_STATUS; + STRUCT mac_status_r MAC_STATUS; + STRUCT ao_status_r AO_STATUS; + uint32_t unused3[1]; + STRUCT dma_status0_r DMA_STATUS0; + STRUCT dma_status1_r DMA_STATUS1; + uint32_t unused4[10]; + STRUCT clkforce_r CLKFORCE; + STRUCT debug_address_r DEBUG_ADDRESS; + STRUCT debug_misc_r DEBUG_MISC; + STRUCT debugcore_r DEBUGCORE; + STRUCT debug_block_r DEBUG_BLOCK; + uint32_t unused5[11]; + STRUCT pmcr_r PMCR; + STRUCT pmcntenset_r PMCNTENSET; + STRUCT pmcntenclr_r PMCNTENCLR; + STRUCT pmovsset_r PMOVSSET; + STRUCT pmovsclr_r PMOVSCLR; + STRUCT 
pmintset_r PMINTSET; + STRUCT pmintclr_r PMINTCLR; + uint32_t unused6[1]; + STRUCT pmccntr_r PMCCNTR; + STRUCT pmccntr_cfg_r PMCCNTR_CFG; + STRUCT pmcaxi_chan_r PMCAXI_CHAN; + uint32_t unused7[20]; + STRUCT kernel_x_r KERNEL_X; + STRUCT kernel_y_r KERNEL_Y; + STRUCT kernel_w_m1_r KERNEL_W_M1; + STRUCT kernel_h_m1_r KERNEL_H_M1; + STRUCT ofm_cblk_width_m1_r OFM_CBLK_WIDTH_M1; + STRUCT ofm_cblk_height_m1_r OFM_CBLK_HEIGHT_M1; + STRUCT ofm_cblk_depth_m1_r OFM_CBLK_DEPTH_M1; + STRUCT ifm_cblk_depth_m1_r IFM_CBLK_DEPTH_M1; + STRUCT ofm_x_r OFM_X; + STRUCT ofm_y_r OFM_Y; + STRUCT ofm_z_r OFM_Z; + STRUCT ifm_z_r IFM_Z; + STRUCT pad_top_r PAD_TOP; + STRUCT pad_left_r PAD_LEFT; + STRUCT ifm_cblk_width_r IFM_CBLK_WIDTH; + STRUCT ifm_cblk_height_r IFM_CBLK_HEIGHT; + STRUCT dma_ifm_src_r DMA_IFM_SRC; + STRUCT dma_ifm_dst_r DMA_IFM_DST; + STRUCT dma_ofm_src_r DMA_OFM_SRC; + STRUCT dma_ofm_dst_r DMA_OFM_DST; + STRUCT dma_weight_src_r DMA_WEIGHT_SRC; + STRUCT dma_cmd_src_r DMA_CMD_SRC; + STRUCT dma_cmd_size_r DMA_CMD_SIZE; + STRUCT dma_m2m_src_r DMA_M2M_SRC; + STRUCT dma_m2m_dst_r DMA_M2M_DST; + STRUCT current_qread_r CURRENT_QREAD; + STRUCT dma_scale_src_r DMA_SCALE_SRC; + uint32_t unused8[11]; + STRUCT current_block_r CURRENT_BLOCK; + STRUCT current_op_r CURRENT_OP; + STRUCT current_cmd_r CURRENT_CMD; + uint32_t unused9[16]; + STRUCT pmevcntr_r PMEVCNTR[4]; + uint32_t unused10[28]; + STRUCT pmevtyper_r PMEVTYPER[4]; + uint32_t unused11[28]; + STRUCT shared_buffer_r SHARED_BUFFER[256]; + STRUCT ifm_pad_top_r IFM_PAD_TOP; + STRUCT ifm_pad_left_r IFM_PAD_LEFT; + STRUCT ifm_pad_right_r IFM_PAD_RIGHT; + STRUCT ifm_pad_bottom_r IFM_PAD_BOTTOM; + STRUCT ifm_depth_m1_r IFM_DEPTH_M1; + STRUCT ifm_precision_r IFM_PRECISION; + uint32_t unused12[1]; + STRUCT ifm_upscale_r IFM_UPSCALE; + uint32_t unused13[1]; + STRUCT ifm_zero_point_r IFM_ZERO_POINT; + STRUCT ifm_width0_m1_r IFM_WIDTH0_M1; + STRUCT ifm_height0_m1_r IFM_HEIGHT0_M1; + STRUCT ifm_height1_m1_r IFM_HEIGHT1_M1; + STRUCT 
ifm_ib_end_r IFM_IB_END; + uint32_t unused14[1]; + STRUCT ifm_region_r IFM_REGION; + uint32_t unused15[1]; + STRUCT ofm_width_m1_r OFM_WIDTH_M1; + STRUCT ofm_height_m1_r OFM_HEIGHT_M1; + STRUCT ofm_depth_m1_r OFM_DEPTH_M1; + STRUCT ofm_precision_r OFM_PRECISION; + STRUCT ofm_blk_width_m1_r OFM_BLK_WIDTH_M1; + STRUCT ofm_blk_height_m1_r OFM_BLK_HEIGHT_M1; + STRUCT ofm_blk_depth_m1_r OFM_BLK_DEPTH_M1; + STRUCT ofm_zero_point_r OFM_ZERO_POINT; + uint32_t unused16[1]; + STRUCT ofm_width0_m1_r OFM_WIDTH0_M1; + STRUCT ofm_height0_m1_r OFM_HEIGHT0_M1; + STRUCT ofm_height1_m1_r OFM_HEIGHT1_M1; + uint32_t unused17[2]; + STRUCT ofm_region_r OFM_REGION; + STRUCT kernel_width_m1_r KERNEL_WIDTH_M1; + STRUCT kernel_height_m1_r KERNEL_HEIGHT_M1; + STRUCT kernel_stride_r KERNEL_STRIDE; + STRUCT parallel_mode_r PARALLEL_MODE; + STRUCT acc_format_r ACC_FORMAT; + STRUCT activation_r ACTIVATION; + STRUCT activation_min_r ACTIVATION_MIN; + STRUCT activation_max_r ACTIVATION_MAX; + STRUCT weight_region_r WEIGHT_REGION; + STRUCT scale_region_r SCALE_REGION; + uint32_t unused18[3]; + STRUCT ab_start_r AB_START; + uint32_t unused19[1]; + STRUCT blockdep_r BLOCKDEP; + STRUCT dma0_src_region_r DMA0_SRC_REGION; + STRUCT dma0_dst_region_r DMA0_DST_REGION; + STRUCT dma0_size0_r DMA0_SIZE0; + STRUCT dma0_size1_r DMA0_SIZE1; + uint32_t unused20[12]; + STRUCT ifm2_broadcast_r IFM2_BROADCAST; + STRUCT ifm2_scalar_r IFM2_SCALAR; + uint32_t unused21[3]; + STRUCT ifm2_precision_r IFM2_PRECISION; + uint32_t unused22[3]; + STRUCT ifm2_zero_point_r IFM2_ZERO_POINT; + STRUCT ifm2_width0_m1_r IFM2_WIDTH0_M1; + STRUCT ifm2_height0_m1_r IFM2_HEIGHT0_M1; + STRUCT ifm2_height1_m1_r IFM2_HEIGHT1_M1; + STRUCT ifm2_ib_start_r IFM2_IB_START; + uint32_t unused23[1]; + STRUCT ifm2_region_r IFM2_REGION; + uint32_t unused24[48]; + STRUCT ifm_base0_r IFM_BASE0; + STRUCT ifm_base1_r IFM_BASE1; + STRUCT ifm_base2_r IFM_BASE2; + STRUCT ifm_base3_r IFM_BASE3; + STRUCT ifm_stride_x_r IFM_STRIDE_X; + STRUCT ifm_stride_y_r 
IFM_STRIDE_Y; + STRUCT ifm_stride_c_r IFM_STRIDE_C; + uint32_t unused25[2]; + STRUCT ofm_base0_r OFM_BASE0; + STRUCT ofm_base1_r OFM_BASE1; + STRUCT ofm_base2_r OFM_BASE2; + STRUCT ofm_base3_r OFM_BASE3; + STRUCT ofm_stride_x_r OFM_STRIDE_X; + STRUCT ofm_stride_y_r OFM_STRIDE_Y; + STRUCT ofm_stride_c_r OFM_STRIDE_C; + uint32_t unused26[2]; + STRUCT weight_base_r WEIGHT_BASE; + STRUCT weight_length_r WEIGHT_LENGTH; + STRUCT scale_base_r SCALE_BASE; + STRUCT scale_length_r SCALE_LENGTH; + STRUCT ofm_scale_r OFM_SCALE; + STRUCT opa_scale_r OPA_SCALE; + STRUCT opb_scale_r OPB_SCALE; + uint32_t unused27[3]; + STRUCT dma0_src_r DMA0_SRC; + STRUCT dma0_dst_r DMA0_DST; + STRUCT dma0_len_r DMA0_LEN; + STRUCT dma0_skip0_r DMA0_SKIP0; + STRUCT dma0_skip1_r DMA0_SKIP1; + uint32_t unused28[6]; + STRUCT ifm2_base0_r IFM2_BASE0; + STRUCT ifm2_base1_r IFM2_BASE1; + STRUCT ifm2_base2_r IFM2_BASE2; + STRUCT ifm2_base3_r IFM2_BASE3; + STRUCT ifm2_stride_x_r IFM2_STRIDE_X; + STRUCT ifm2_stride_y_r IFM2_STRIDE_Y; + STRUCT ifm2_stride_c_r IFM2_STRIDE_C; + uint32_t unused29[2]; + STRUCT weight1_base_r WEIGHT1_BASE; + STRUCT weight1_length_r WEIGHT1_LENGTH; + STRUCT scale1_base_r SCALE1_BASE; + STRUCT scale1_length_r SCALE1_LENGTH; + uint32_t unused30[280]; + STRUCT revision_r REVISION; + uint32_t unused31[3]; + STRUCT pid4_r PID4; + STRUCT pid5_r PID5; + STRUCT pid6_r PID6; + STRUCT pid7_r PID7; + STRUCT pid0_r PID0; + STRUCT pid1_r PID1; + STRUCT pid2_r PID2; + STRUCT pid3_r PID3; + STRUCT cid0_r CID0; + STRUCT cid1_r CID1; + STRUCT cid2_r CID2; + STRUCT cid3_r CID3; + +#ifdef __cplusplus + enum class access_type_t : uint8_t { RW, RO, WO }; + NPU_REG() + { + reset(); + } + void reset() + { + ID = 268853249; + STATUS = 8; + CMD = 12; + RESET = 0; + QBASE = 0; + QREAD = 0; + QCONFIG = 0; + QSIZE = 0; + PROT = 0; + CONFIG = 268435456; + LOCK = 0; + REGIONCFG = 0; + AXI_LIMIT0 = 0; + AXI_LIMIT1 = 0; + AXI_LIMIT2 = 0; + AXI_LIMIT3 = 0; + for (size_t i = 0; i < (sizeof(BASEP) / 
sizeof(BASEP[0])); ++i) + BASEP[i] = 0; + WD_STATUS = 0; + MAC_STATUS = 0; + AO_STATUS = 0; + DMA_STATUS0 = 0; + DMA_STATUS1 = 0; + CLKFORCE = 0; + DEBUG_ADDRESS = 0; + DEBUG_MISC = 0; + DEBUGCORE = 0; + DEBUG_BLOCK = 0; + PMCR = 8192; + PMCNTENSET = 0; + PMCNTENCLR = 0; + PMOVSSET = 0; + PMOVSCLR = 0; + PMINTSET = 0; + PMINTCLR = 0; + PMCCNTR = 0; + PMCCNTR_CFG = 0; + PMCAXI_CHAN = 0; + KERNEL_X = 0; + KERNEL_Y = 0; + KERNEL_W_M1 = 0; + KERNEL_H_M1 = 0; + OFM_CBLK_WIDTH_M1 = 0; + OFM_CBLK_HEIGHT_M1 = 0; + OFM_CBLK_DEPTH_M1 = 0; + IFM_CBLK_DEPTH_M1 = 0; + OFM_X = 0; + OFM_Y = 0; + OFM_Z = 0; + IFM_Z = 0; + PAD_TOP = 0; + PAD_LEFT = 0; + IFM_CBLK_WIDTH = 0; + IFM_CBLK_HEIGHT = 0; + DMA_IFM_SRC = 0; + DMA_IFM_DST = 0; + DMA_OFM_SRC = 0; + DMA_OFM_DST = 0; + DMA_WEIGHT_SRC = 0; + DMA_CMD_SRC = 0; + DMA_CMD_SIZE = 0; + DMA_M2M_SRC = 0; + DMA_M2M_DST = 0; + CURRENT_QREAD = 0; + DMA_SCALE_SRC = 0; + CURRENT_BLOCK = 0; + CURRENT_OP = 0; + CURRENT_CMD = 0; + for (size_t i = 0; i < (sizeof(PMEVCNTR) / sizeof(PMEVCNTR[0])); ++i) + PMEVCNTR[i] = 0; + for (size_t i = 0; i < (sizeof(PMEVTYPER) / sizeof(PMEVTYPER[0])); ++i) + PMEVTYPER[i] = 0; + for (size_t i = 0; i < (sizeof(SHARED_BUFFER) / sizeof(SHARED_BUFFER[0])); ++i) + SHARED_BUFFER[i] = 0; + IFM_PAD_TOP = 0; + IFM_PAD_LEFT = 0; + IFM_PAD_RIGHT = 0; + IFM_PAD_BOTTOM = 0; + IFM_DEPTH_M1 = 0; + IFM_PRECISION = 0; + IFM_UPSCALE = 0; + IFM_ZERO_POINT = 0; + IFM_WIDTH0_M1 = 0; + IFM_HEIGHT0_M1 = 0; + IFM_HEIGHT1_M1 = 0; + IFM_IB_END = 0; + IFM_REGION = 0; + OFM_WIDTH_M1 = 0; + OFM_HEIGHT_M1 = 0; + OFM_DEPTH_M1 = 0; + OFM_PRECISION = 0; + OFM_BLK_WIDTH_M1 = 0; + OFM_BLK_HEIGHT_M1 = 0; + OFM_BLK_DEPTH_M1 = 0; + OFM_ZERO_POINT = 0; + OFM_WIDTH0_M1 = 0; + OFM_HEIGHT0_M1 = 0; + OFM_HEIGHT1_M1 = 0; + OFM_REGION = 0; + KERNEL_WIDTH_M1 = 0; + KERNEL_HEIGHT_M1 = 0; + KERNEL_STRIDE = 0; + PARALLEL_MODE = 0; + ACC_FORMAT = 0; + ACTIVATION = 0; + ACTIVATION_MIN = 0; + ACTIVATION_MAX = 0; + WEIGHT_REGION = 0; + SCALE_REGION = 0; + AB_START 
= 0; + BLOCKDEP = 0; + DMA0_SRC_REGION = 0; + DMA0_DST_REGION = 0; + DMA0_SIZE0 = 0; + DMA0_SIZE1 = 0; + IFM2_BROADCAST = 0; + IFM2_SCALAR = 0; + IFM2_PRECISION = 0; + IFM2_ZERO_POINT = 0; + IFM2_WIDTH0_M1 = 0; + IFM2_HEIGHT0_M1 = 0; + IFM2_HEIGHT1_M1 = 0; + IFM2_IB_START = 0; + IFM2_REGION = 0; + IFM_BASE0 = 0; + IFM_BASE1 = 0; + IFM_BASE2 = 0; + IFM_BASE3 = 0; + IFM_STRIDE_X = 0; + IFM_STRIDE_Y = 0; + IFM_STRIDE_C = 0; + OFM_BASE0 = 0; + OFM_BASE1 = 0; + OFM_BASE2 = 0; + OFM_BASE3 = 0; + OFM_STRIDE_X = 0; + OFM_STRIDE_Y = 0; + OFM_STRIDE_C = 0; + WEIGHT_BASE = 0; + WEIGHT_LENGTH = 0; + SCALE_BASE = 0; + SCALE_LENGTH = 0; + OFM_SCALE = 0; + OPA_SCALE = 0; + OPB_SCALE = 0; + DMA0_SRC = 0; + DMA0_DST = 0; + DMA0_LEN = 0; + DMA0_SKIP0 = 0; + DMA0_SKIP1 = 0; + IFM2_BASE0 = 0; + IFM2_BASE1 = 0; + IFM2_BASE2 = 0; + IFM2_BASE3 = 0; + IFM2_STRIDE_X = 0; + IFM2_STRIDE_Y = 0; + IFM2_STRIDE_C = 0; + WEIGHT1_BASE = 0; + WEIGHT1_LENGTH = 0; + SCALE1_BASE = 0; + SCALE1_LENGTH = 0; + REVISION = 0; + PID4 = 4; + PID5 = 0; + PID6 = 0; + PID7 = 0; + PID0 = 129; + PID1 = 181; + PID2 = 11; + PID3 = 0; + CID0 = 13; + CID1 = 240; + CID2 = 5; + CID3 = 177; + } + uint32_t& operator[](const int addr_offset) + { + return reinterpret_cast(this)[addr_offset / 4]; + } + access_type_t get_access_type(uint32_t offset) + { + switch (offset) + { + case 0: return access_type_t::RO; + case 4: return access_type_t::RO; + case 8: return access_type_t::RW; + case 12: return access_type_t::RW; + case 16: return access_type_t::RW; + case 24: return access_type_t::RO; + case 28: return access_type_t::RW; + case 32: return access_type_t::RW; + case 36: return access_type_t::RO; + case 40: return access_type_t::RO; + case 44: return access_type_t::RW; + case 60: return access_type_t::RW; + case 64: return access_type_t::RW; + case 68: return access_type_t::RW; + case 72: return access_type_t::RW; + case 76: return access_type_t::RW; + case 128: return access_type_t::RW; + case 136: return 
access_type_t::RW; + case 144: return access_type_t::RW; + case 152: return access_type_t::RW; + case 160: return access_type_t::RW; + case 168: return access_type_t::RW; + case 176: return access_type_t::RW; + case 184: return access_type_t::RW; + case 256: return access_type_t::RO; + case 260: return access_type_t::RO; + case 264: return access_type_t::RO; + case 272: return access_type_t::RO; + case 276: return access_type_t::RO; + case 320: return access_type_t::RW; + case 324: return access_type_t::RW; + case 328: return access_type_t::RW; + case 332: return access_type_t::RW; + case 336: return access_type_t::RW; + case 384: return access_type_t::RW; + case 388: return access_type_t::RW; + case 392: return access_type_t::RW; + case 396: return access_type_t::RW; + case 400: return access_type_t::RW; + case 404: return access_type_t::RW; + case 408: return access_type_t::RW; + case 416: return access_type_t::RW; + case 424: return access_type_t::RW; + case 428: return access_type_t::RW; + case 512: return access_type_t::RO; + case 516: return access_type_t::RO; + case 520: return access_type_t::RO; + case 524: return access_type_t::RO; + case 528: return access_type_t::RO; + case 532: return access_type_t::RO; + case 536: return access_type_t::RO; + case 540: return access_type_t::RO; + case 544: return access_type_t::RO; + case 548: return access_type_t::RO; + case 552: return access_type_t::RO; + case 556: return access_type_t::RO; + case 560: return access_type_t::RO; + case 564: return access_type_t::RO; + case 568: return access_type_t::RO; + case 572: return access_type_t::RO; + case 576: return access_type_t::RO; + case 584: return access_type_t::RO; + case 588: return access_type_t::RO; + case 592: return access_type_t::RO; + case 600: return access_type_t::RO; + case 608: return access_type_t::RO; + case 616: return access_type_t::RO; + case 620: return access_type_t::RO; + case 628: return access_type_t::RO; + case 636: return access_type_t::RO; + 
case 640: return access_type_t::RO; + case 692: return access_type_t::RO; + case 696: return access_type_t::RO; + case 700: return access_type_t::RO; + case 768: return access_type_t::RW; + case 772: return access_type_t::RW; + case 776: return access_type_t::RW; + case 780: return access_type_t::RW; + case 896: return access_type_t::RW; + case 900: return access_type_t::RW; + case 904: return access_type_t::RW; + case 908: return access_type_t::RW; + case 1024: return access_type_t::RW; + case 1028: return access_type_t::RW; + case 1032: return access_type_t::RW; + case 1036: return access_type_t::RW; + case 1040: return access_type_t::RW; + case 1044: return access_type_t::RW; + case 1048: return access_type_t::RW; + case 1052: return access_type_t::RW; + case 1056: return access_type_t::RW; + case 1060: return access_type_t::RW; + case 1064: return access_type_t::RW; + case 1068: return access_type_t::RW; + case 1072: return access_type_t::RW; + case 1076: return access_type_t::RW; + case 1080: return access_type_t::RW; + case 1084: return access_type_t::RW; + case 1088: return access_type_t::RW; + case 1092: return access_type_t::RW; + case 1096: return access_type_t::RW; + case 1100: return access_type_t::RW; + case 1104: return access_type_t::RW; + case 1108: return access_type_t::RW; + case 1112: return access_type_t::RW; + case 1116: return access_type_t::RW; + case 1120: return access_type_t::RW; + case 1124: return access_type_t::RW; + case 1128: return access_type_t::RW; + case 1132: return access_type_t::RW; + case 1136: return access_type_t::RW; + case 1140: return access_type_t::RW; + case 1144: return access_type_t::RW; + case 1148: return access_type_t::RW; + case 1152: return access_type_t::RW; + case 1156: return access_type_t::RW; + case 1160: return access_type_t::RW; + case 1164: return access_type_t::RW; + case 1168: return access_type_t::RW; + case 1172: return access_type_t::RW; + case 1176: return access_type_t::RW; + case 1180: return 
access_type_t::RW; + case 1184: return access_type_t::RW; + case 1188: return access_type_t::RW; + case 1192: return access_type_t::RW; + case 1196: return access_type_t::RW; + case 1200: return access_type_t::RW; + case 1204: return access_type_t::RW; + case 1208: return access_type_t::RW; + case 1212: return access_type_t::RW; + case 1216: return access_type_t::RW; + case 1220: return access_type_t::RW; + case 1224: return access_type_t::RW; + case 1228: return access_type_t::RW; + case 1232: return access_type_t::RW; + case 1236: return access_type_t::RW; + case 1240: return access_type_t::RW; + case 1244: return access_type_t::RW; + case 1248: return access_type_t::RW; + case 1252: return access_type_t::RW; + case 1256: return access_type_t::RW; + case 1260: return access_type_t::RW; + case 1264: return access_type_t::RW; + case 1268: return access_type_t::RW; + case 1272: return access_type_t::RW; + case 1276: return access_type_t::RW; + case 1280: return access_type_t::RW; + case 1284: return access_type_t::RW; + case 1288: return access_type_t::RW; + case 1292: return access_type_t::RW; + case 1296: return access_type_t::RW; + case 1300: return access_type_t::RW; + case 1304: return access_type_t::RW; + case 1308: return access_type_t::RW; + case 1312: return access_type_t::RW; + case 1316: return access_type_t::RW; + case 1320: return access_type_t::RW; + case 1324: return access_type_t::RW; + case 1328: return access_type_t::RW; + case 1332: return access_type_t::RW; + case 1336: return access_type_t::RW; + case 1340: return access_type_t::RW; + case 1344: return access_type_t::RW; + case 1348: return access_type_t::RW; + case 1352: return access_type_t::RW; + case 1356: return access_type_t::RW; + case 1360: return access_type_t::RW; + case 1364: return access_type_t::RW; + case 1368: return access_type_t::RW; + case 1372: return access_type_t::RW; + case 1376: return access_type_t::RW; + case 1380: return access_type_t::RW; + case 1384: return 
access_type_t::RW; + case 1388: return access_type_t::RW; + case 1392: return access_type_t::RW; + case 1396: return access_type_t::RW; + case 1400: return access_type_t::RW; + case 1404: return access_type_t::RW; + case 1408: return access_type_t::RW; + case 1412: return access_type_t::RW; + case 1416: return access_type_t::RW; + case 1420: return access_type_t::RW; + case 1424: return access_type_t::RW; + case 1428: return access_type_t::RW; + case 1432: return access_type_t::RW; + case 1436: return access_type_t::RW; + case 1440: return access_type_t::RW; + case 1444: return access_type_t::RW; + case 1448: return access_type_t::RW; + case 1452: return access_type_t::RW; + case 1456: return access_type_t::RW; + case 1460: return access_type_t::RW; + case 1464: return access_type_t::RW; + case 1468: return access_type_t::RW; + case 1472: return access_type_t::RW; + case 1476: return access_type_t::RW; + case 1480: return access_type_t::RW; + case 1484: return access_type_t::RW; + case 1488: return access_type_t::RW; + case 1492: return access_type_t::RW; + case 1496: return access_type_t::RW; + case 1500: return access_type_t::RW; + case 1504: return access_type_t::RW; + case 1508: return access_type_t::RW; + case 1512: return access_type_t::RW; + case 1516: return access_type_t::RW; + case 1520: return access_type_t::RW; + case 1524: return access_type_t::RW; + case 1528: return access_type_t::RW; + case 1532: return access_type_t::RW; + case 1536: return access_type_t::RW; + case 1540: return access_type_t::RW; + case 1544: return access_type_t::RW; + case 1548: return access_type_t::RW; + case 1552: return access_type_t::RW; + case 1556: return access_type_t::RW; + case 1560: return access_type_t::RW; + case 1564: return access_type_t::RW; + case 1568: return access_type_t::RW; + case 1572: return access_type_t::RW; + case 1576: return access_type_t::RW; + case 1580: return access_type_t::RW; + case 1584: return access_type_t::RW; + case 1588: return 
access_type_t::RW; + case 1592: return access_type_t::RW; + case 1596: return access_type_t::RW; + case 1600: return access_type_t::RW; + case 1604: return access_type_t::RW; + case 1608: return access_type_t::RW; + case 1612: return access_type_t::RW; + case 1616: return access_type_t::RW; + case 1620: return access_type_t::RW; + case 1624: return access_type_t::RW; + case 1628: return access_type_t::RW; + case 1632: return access_type_t::RW; + case 1636: return access_type_t::RW; + case 1640: return access_type_t::RW; + case 1644: return access_type_t::RW; + case 1648: return access_type_t::RW; + case 1652: return access_type_t::RW; + case 1656: return access_type_t::RW; + case 1660: return access_type_t::RW; + case 1664: return access_type_t::RW; + case 1668: return access_type_t::RW; + case 1672: return access_type_t::RW; + case 1676: return access_type_t::RW; + case 1680: return access_type_t::RW; + case 1684: return access_type_t::RW; + case 1688: return access_type_t::RW; + case 1692: return access_type_t::RW; + case 1696: return access_type_t::RW; + case 1700: return access_type_t::RW; + case 1704: return access_type_t::RW; + case 1708: return access_type_t::RW; + case 1712: return access_type_t::RW; + case 1716: return access_type_t::RW; + case 1720: return access_type_t::RW; + case 1724: return access_type_t::RW; + case 1728: return access_type_t::RW; + case 1732: return access_type_t::RW; + case 1736: return access_type_t::RW; + case 1740: return access_type_t::RW; + case 1744: return access_type_t::RW; + case 1748: return access_type_t::RW; + case 1752: return access_type_t::RW; + case 1756: return access_type_t::RW; + case 1760: return access_type_t::RW; + case 1764: return access_type_t::RW; + case 1768: return access_type_t::RW; + case 1772: return access_type_t::RW; + case 1776: return access_type_t::RW; + case 1780: return access_type_t::RW; + case 1784: return access_type_t::RW; + case 1788: return access_type_t::RW; + case 1792: return 
access_type_t::RW; + case 1796: return access_type_t::RW; + case 1800: return access_type_t::RW; + case 1804: return access_type_t::RW; + case 1808: return access_type_t::RW; + case 1812: return access_type_t::RW; + case 1816: return access_type_t::RW; + case 1820: return access_type_t::RW; + case 1824: return access_type_t::RW; + case 1828: return access_type_t::RW; + case 1832: return access_type_t::RW; + case 1836: return access_type_t::RW; + case 1840: return access_type_t::RW; + case 1844: return access_type_t::RW; + case 1848: return access_type_t::RW; + case 1852: return access_type_t::RW; + case 1856: return access_type_t::RW; + case 1860: return access_type_t::RW; + case 1864: return access_type_t::RW; + case 1868: return access_type_t::RW; + case 1872: return access_type_t::RW; + case 1876: return access_type_t::RW; + case 1880: return access_type_t::RW; + case 1884: return access_type_t::RW; + case 1888: return access_type_t::RW; + case 1892: return access_type_t::RW; + case 1896: return access_type_t::RW; + case 1900: return access_type_t::RW; + case 1904: return access_type_t::RW; + case 1908: return access_type_t::RW; + case 1912: return access_type_t::RW; + case 1916: return access_type_t::RW; + case 1920: return access_type_t::RW; + case 1924: return access_type_t::RW; + case 1928: return access_type_t::RW; + case 1932: return access_type_t::RW; + case 1936: return access_type_t::RW; + case 1940: return access_type_t::RW; + case 1944: return access_type_t::RW; + case 1948: return access_type_t::RW; + case 1952: return access_type_t::RW; + case 1956: return access_type_t::RW; + case 1960: return access_type_t::RW; + case 1964: return access_type_t::RW; + case 1968: return access_type_t::RW; + case 1972: return access_type_t::RW; + case 1976: return access_type_t::RW; + case 1980: return access_type_t::RW; + case 1984: return access_type_t::RW; + case 1988: return access_type_t::RW; + case 1992: return access_type_t::RW; + case 1996: return 
access_type_t::RW; + case 2000: return access_type_t::RW; + case 2004: return access_type_t::RW; + case 2008: return access_type_t::RW; + case 2012: return access_type_t::RW; + case 2016: return access_type_t::RW; + case 2020: return access_type_t::RW; + case 2024: return access_type_t::RW; + case 2028: return access_type_t::RW; + case 2032: return access_type_t::RW; + case 2036: return access_type_t::RW; + case 2040: return access_type_t::RW; + case 2044: return access_type_t::RW; + case 2048: return access_type_t::RW; + case 2052: return access_type_t::RW; + case 2056: return access_type_t::RW; + case 2060: return access_type_t::RW; + case 2064: return access_type_t::RW; + case 2068: return access_type_t::RW; + case 2076: return access_type_t::RW; + case 2084: return access_type_t::RW; + case 2088: return access_type_t::RW; + case 2092: return access_type_t::RW; + case 2096: return access_type_t::RW; + case 2100: return access_type_t::RW; + case 2108: return access_type_t::RW; + case 2116: return access_type_t::RW; + case 2120: return access_type_t::RW; + case 2124: return access_type_t::RW; + case 2128: return access_type_t::RW; + case 2132: return access_type_t::RW; + case 2136: return access_type_t::RW; + case 2140: return access_type_t::RW; + case 2144: return access_type_t::RW; + case 2152: return access_type_t::RW; + case 2156: return access_type_t::RW; + case 2160: return access_type_t::RW; + case 2172: return access_type_t::RW; + case 2176: return access_type_t::RW; + case 2180: return access_type_t::RW; + case 2184: return access_type_t::RW; + case 2188: return access_type_t::RW; + case 2192: return access_type_t::RW; + case 2196: return access_type_t::RW; + case 2200: return access_type_t::RW; + case 2204: return access_type_t::RW; + case 2208: return access_type_t::RW; + case 2212: return access_type_t::RW; + case 2228: return access_type_t::RW; + case 2236: return access_type_t::RW; + case 2240: return access_type_t::RW; + case 2244: return 
access_type_t::RW; + case 2248: return access_type_t::RW; + case 2252: return access_type_t::RW; + case 2304: return access_type_t::RW; + case 2308: return access_type_t::RW; + case 2324: return access_type_t::RW; + case 2340: return access_type_t::RW; + case 2344: return access_type_t::RW; + case 2348: return access_type_t::RW; + case 2352: return access_type_t::RW; + case 2356: return access_type_t::RW; + case 2364: return access_type_t::RW; + case 2560: return access_type_t::RW; + case 2568: return access_type_t::RW; + case 2576: return access_type_t::RW; + case 2584: return access_type_t::RW; + case 2592: return access_type_t::RW; + case 2600: return access_type_t::RW; + case 2608: return access_type_t::RW; + case 2624: return access_type_t::RW; + case 2632: return access_type_t::RW; + case 2640: return access_type_t::RW; + case 2648: return access_type_t::RW; + case 2656: return access_type_t::RW; + case 2664: return access_type_t::RW; + case 2672: return access_type_t::RW; + case 2688: return access_type_t::RW; + case 2696: return access_type_t::RW; + case 2704: return access_type_t::RW; + case 2712: return access_type_t::RW; + case 2720: return access_type_t::RW; + case 2728: return access_type_t::RW; + case 2736: return access_type_t::RW; + case 2752: return access_type_t::RW; + case 2760: return access_type_t::RW; + case 2768: return access_type_t::RW; + case 2776: return access_type_t::RW; + case 2784: return access_type_t::RW; + case 2816: return access_type_t::RW; + case 2824: return access_type_t::RW; + case 2832: return access_type_t::RW; + case 2840: return access_type_t::RW; + case 2848: return access_type_t::RW; + case 2856: return access_type_t::RW; + case 2864: return access_type_t::RW; + case 2880: return access_type_t::RW; + case 2888: return access_type_t::RW; + case 2896: return access_type_t::RW; + case 2904: return access_type_t::RW; + case 4032: return access_type_t::RO; + case 4048: return access_type_t::RO; + case 4052: return 
access_type_t::RO; + case 4056: return access_type_t::RO; + case 4060: return access_type_t::RO; + case 4064: return access_type_t::RO; + case 4068: return access_type_t::RO; + case 4072: return access_type_t::RO; + case 4076: return access_type_t::RO; + case 4080: return access_type_t::RO; + case 4084: return access_type_t::RO; + case 4088: return access_type_t::RO; + case 4092: return access_type_t::RO; + default: return access_type_t::RO; + } + } +#endif +}; + +#ifdef __cplusplus +struct isa +{ +#ifdef NPU_DISASSEMBLE +static int disassemble(const uint32_t* in, std::string& op, std::vector>& fields) +{ + switch (*in & 0xffff) + { + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_STOP): + { + const npu_op_stop_t& v = *reinterpret_cast(in); + op = "NPU_OP_STOP"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_IRQ): + { + const npu_op_irq_t& v = *reinterpret_cast(in); + op = "NPU_OP_IRQ"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_CONV): + { + const npu_op_conv_t& v = *reinterpret_cast(in); + op = "NPU_OP_CONV"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_DEPTHWISE): + { + const npu_op_depthwise_t& v = *reinterpret_cast(in); + op = "NPU_OP_DEPTHWISE"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_POOL): + { + const npu_op_pool_t& v = *reinterpret_cast(in); + op = "NPU_OP_POOL"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_ELEMENTWISE): + { + const npu_op_elementwise_t& v = 
*reinterpret_cast(in); + op = "NPU_OP_ELEMENTWISE"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_DMA_START): + { + const npu_op_dma_start_t& v = *reinterpret_cast(in); + op = "NPU_OP_DMA_START"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_DMA_WAIT): + { + const npu_op_dma_wait_t& v = *reinterpret_cast(in); + op = "NPU_OP_DMA_WAIT"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_KERNEL_WAIT): + { + const npu_op_kernel_wait_t& v = *reinterpret_cast(in); + op = "NPU_OP_KERNEL_WAIT"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_PMU_MASK): + { + const npu_op_pmu_mask_t& v = *reinterpret_cast(in); + op = "NPU_OP_PMU_MASK"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_TOP): + { + const npu_set_ifm_pad_top_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_PAD_TOP"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_LEFT): + { + const npu_set_ifm_pad_left_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_PAD_LEFT"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_RIGHT): + { + const npu_set_ifm_pad_right_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_PAD_RIGHT"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_BOTTOM): + { + 
const npu_set_ifm_pad_bottom_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_PAD_BOTTOM"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_DEPTH_M1): + { + const npu_set_ifm_depth_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_DEPTH_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PRECISION): + { + const npu_set_ifm_precision_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_PRECISION"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_UPSCALE): + { + const npu_set_ifm_upscale_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_UPSCALE"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_ZERO_POINT): + { + const npu_set_ifm_zero_point_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_ZERO_POINT"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_WIDTH0_M1): + { + const npu_set_ifm_width0_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_WIDTH0_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_HEIGHT0_M1): + { + const npu_set_ifm_height0_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_HEIGHT0_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_HEIGHT1_M1): + { + const npu_set_ifm_height1_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_HEIGHT1_M1"; + v.disassemble(fields); + break; + } + case 
(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_IB_END): + { + const npu_set_ifm_ib_end_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_IB_END"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_REGION): + { + const npu_set_ifm_region_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_REGION"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_WIDTH_M1): + { + const npu_set_ofm_width_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_WIDTH_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT_M1): + { + const npu_set_ofm_height_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_HEIGHT_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_DEPTH_M1): + { + const npu_set_ofm_depth_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_DEPTH_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_PRECISION): + { + const npu_set_ofm_precision_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_PRECISION"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_WIDTH_M1): + { + const npu_set_ofm_blk_width_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_BLK_WIDTH_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_HEIGHT_M1): + { + const npu_set_ofm_blk_height_m1_t& v = 
*reinterpret_cast(in); + op = "NPU_SET_OFM_BLK_HEIGHT_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_DEPTH_M1): + { + const npu_set_ofm_blk_depth_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_BLK_DEPTH_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_ZERO_POINT): + { + const npu_set_ofm_zero_point_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_ZERO_POINT"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_WIDTH0_M1): + { + const npu_set_ofm_width0_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_WIDTH0_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT0_M1): + { + const npu_set_ofm_height0_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_HEIGHT0_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT1_M1): + { + const npu_set_ofm_height1_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_HEIGHT1_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_REGION): + { + const npu_set_ofm_region_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_REGION"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_WIDTH_M1): + { + const npu_set_kernel_width_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_KERNEL_WIDTH_M1"; + v.disassemble(fields); + break; + } + case 
(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_HEIGHT_M1): + { + const npu_set_kernel_height_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_KERNEL_HEIGHT_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_STRIDE): + { + const npu_set_kernel_stride_t& v = *reinterpret_cast(in); + op = "NPU_SET_KERNEL_STRIDE"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_PARALLEL_MODE): + { + const npu_set_parallel_mode_t& v = *reinterpret_cast(in); + op = "NPU_SET_PARALLEL_MODE"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACC_FORMAT): + { + const npu_set_acc_format_t& v = *reinterpret_cast(in); + op = "NPU_SET_ACC_FORMAT"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION): + { + const npu_set_activation_t& v = *reinterpret_cast(in); + op = "NPU_SET_ACTIVATION"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION_MIN): + { + const npu_set_activation_min_t& v = *reinterpret_cast(in); + op = "NPU_SET_ACTIVATION_MIN"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION_MAX): + { + const npu_set_activation_max_t& v = *reinterpret_cast(in); + op = "NPU_SET_ACTIVATION_MAX"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_WEIGHT_REGION): + { + const npu_set_weight_region_t& v 
= *reinterpret_cast(in); + op = "NPU_SET_WEIGHT_REGION"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_SCALE_REGION): + { + const npu_set_scale_region_t& v = *reinterpret_cast(in); + op = "NPU_SET_SCALE_REGION"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_AB_START): + { + const npu_set_ab_start_t& v = *reinterpret_cast(in); + op = "NPU_SET_AB_START"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_BLOCKDEP): + { + const npu_set_blockdep_t& v = *reinterpret_cast(in); + op = "NPU_SET_BLOCKDEP"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SRC_REGION): + { + const npu_set_dma0_src_region_t& v = *reinterpret_cast(in); + op = "NPU_SET_DMA0_SRC_REGION"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_DST_REGION): + { + const npu_set_dma0_dst_region_t& v = *reinterpret_cast(in); + op = "NPU_SET_DMA0_DST_REGION"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SIZE0): + { + const npu_set_dma0_size0_t& v = *reinterpret_cast(in); + op = "NPU_SET_DMA0_SIZE0"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SIZE1): + { + const npu_set_dma0_size1_t& v = *reinterpret_cast(in); + op = "NPU_SET_DMA0_SIZE1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | 
static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_BROADCAST): + { + const npu_set_ifm2_broadcast_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_BROADCAST"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_SCALAR): + { + const npu_set_ifm2_scalar_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_SCALAR"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_PRECISION): + { + const npu_set_ifm2_precision_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_PRECISION"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_ZERO_POINT): + { + const npu_set_ifm2_zero_point_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_ZERO_POINT"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_WIDTH0_M1): + { + const npu_set_ifm2_width0_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_WIDTH0_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_HEIGHT0_M1): + { + const npu_set_ifm2_height0_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_HEIGHT0_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_HEIGHT1_M1): + { + const npu_set_ifm2_height1_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_HEIGHT1_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_IB_START): + { + const npu_set_ifm2_ib_start_t& v = *reinterpret_cast(in); + op = 
"NPU_SET_IFM2_IB_START"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_REGION): + { + const npu_set_ifm2_region_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_REGION"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE0): + { + const npu_set_ifm_base0_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_BASE0"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE1): + { + const npu_set_ifm_base1_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_BASE1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE2): + { + const npu_set_ifm_base2_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_BASE2"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE3): + { + const npu_set_ifm_base3_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_BASE3"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_X): + { + const npu_set_ifm_stride_x_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_STRIDE_X"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_Y): + { + const npu_set_ifm_stride_y_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_STRIDE_Y"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_C): + { + const 
npu_set_ifm_stride_c_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_STRIDE_C"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE0): + { + const npu_set_ofm_base0_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_BASE0"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE1): + { + const npu_set_ofm_base1_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_BASE1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE2): + { + const npu_set_ofm_base2_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_BASE2"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE3): + { + const npu_set_ofm_base3_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_BASE3"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_X): + { + const npu_set_ofm_stride_x_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_STRIDE_X"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_Y): + { + const npu_set_ofm_stride_y_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_STRIDE_Y"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_C): + { + const npu_set_ofm_stride_c_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_STRIDE_C"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | 
static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT_BASE): + { + const npu_set_weight_base_t& v = *reinterpret_cast(in); + op = "NPU_SET_WEIGHT_BASE"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT_LENGTH): + { + const npu_set_weight_length_t& v = *reinterpret_cast(in); + op = "NPU_SET_WEIGHT_LENGTH"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE_BASE): + { + const npu_set_scale_base_t& v = *reinterpret_cast(in); + op = "NPU_SET_SCALE_BASE"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE_LENGTH): + { + const npu_set_scale_length_t& v = *reinterpret_cast(in); + op = "NPU_SET_SCALE_LENGTH"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_SCALE): + { + const npu_set_ofm_scale_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_SCALE"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OPA_SCALE): + { + const npu_set_opa_scale_t& v = *reinterpret_cast(in); + op = "NPU_SET_OPA_SCALE"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OPB_SCALE): + { + const npu_set_opb_scale_t& v = *reinterpret_cast(in); + op = "NPU_SET_OPB_SCALE"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SRC): + { + const npu_set_dma0_src_t& v = *reinterpret_cast(in); + op = "NPU_SET_DMA0_SRC"; + v.disassemble(fields); + break; + } + case 
(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_DST): + { + const npu_set_dma0_dst_t& v = *reinterpret_cast(in); + op = "NPU_SET_DMA0_DST"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_LEN): + { + const npu_set_dma0_len_t& v = *reinterpret_cast(in); + op = "NPU_SET_DMA0_LEN"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SKIP0): + { + const npu_set_dma0_skip0_t& v = *reinterpret_cast(in); + op = "NPU_SET_DMA0_SKIP0"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SKIP1): + { + const npu_set_dma0_skip1_t& v = *reinterpret_cast(in); + op = "NPU_SET_DMA0_SKIP1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE0): + { + const npu_set_ifm2_base0_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_BASE0"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE1): + { + const npu_set_ifm2_base1_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_BASE1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE2): + { + const npu_set_ifm2_base2_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_BASE2"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE3): + { + const npu_set_ifm2_base3_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_BASE3"; + v.disassemble(fields); 
+ break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_X): + { + const npu_set_ifm2_stride_x_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_STRIDE_X"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_Y): + { + const npu_set_ifm2_stride_y_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_STRIDE_Y"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_C): + { + const npu_set_ifm2_stride_c_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_STRIDE_C"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT1_BASE): + { + const npu_set_weight1_base_t& v = *reinterpret_cast(in); + op = "NPU_SET_WEIGHT1_BASE"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT1_LENGTH): + { + const npu_set_weight1_length_t& v = *reinterpret_cast(in); + op = "NPU_SET_WEIGHT1_LENGTH"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE1_BASE): + { + const npu_set_scale1_base_t& v = *reinterpret_cast(in); + op = "NPU_SET_SCALE1_BASE"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE1_LENGTH): + { + const npu_set_scale1_length_t& v = *reinterpret_cast(in); + op = "NPU_SET_SCALE1_LENGTH"; + v.disassemble(fields); + break; + } + default: break; + } + return (*in & (3<<14)) != 0 ? 
2 : 1; +} +#endif +#endif + +struct npu_op_stop_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t mask:16; +#ifdef __cplusplus +public: + npu_op_stop_t(uint32_t _mask) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_STOP)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + mask(_mask & ((1U << 16)-1)) + {} + CONSTEXPR npu_op_stop_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_STOP)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + mask(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_STOP) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_STOP); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(mask) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_op_stop_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_op_stop_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_mask() const + { + return static_cast(mask); + } + CONSTEXPR npu_op_stop_t& set_mask(uint32_t value) + { + assert((value >> 16) == 0); + mask = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("mask", std::to_string(mask))); + } +#endif +#endif +}; + +struct npu_op_irq_t +{ 
+#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t mask:16; +#ifdef __cplusplus +public: + npu_op_irq_t(uint32_t _mask) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_IRQ)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + mask(_mask & ((1U << 16)-1)) + {} + CONSTEXPR npu_op_irq_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_IRQ)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + mask(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_IRQ) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_IRQ); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(mask) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_op_irq_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_op_irq_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_mask() const + { + return static_cast(mask); + } + CONSTEXPR npu_op_irq_t& set_mask(uint32_t value) + { + assert((value >> 16) == 0); + mask = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("mask", std::to_string(mask))); + } +#endif +#endif +}; + +struct npu_op_conv_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + 
uint32_t reserved0:4; + uint32_t control:2; + uint32_t reserved1:16; +#ifdef __cplusplus +public: + CONSTEXPR npu_op_conv_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_CONV)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_CONV) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_CONV); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_op_conv_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_op_conv_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>&) const + { + } +#endif +#endif +}; + +struct npu_op_depthwise_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t reserved1:16; +#ifdef __cplusplus +public: + CONSTEXPR npu_op_depthwise_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_DEPTHWISE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_DEPTHWISE) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_DEPTHWISE); control = 
static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_op_depthwise_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_op_depthwise_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>&) const + { + } +#endif +#endif +}; + +struct npu_op_pool_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t pooling_mode:3; + uint32_t reserved1:13; +#ifdef __cplusplus +public: + npu_op_pool_t(NPU_NAMESPACE::pooling_mode _pooling_mode) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_POOL)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + pooling_mode(static_cast(_pooling_mode) & ((1U << 3)-1)), + reserved1(0) + {} + CONSTEXPR npu_op_pool_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_POOL)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + pooling_mode(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_POOL) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_POOL); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(pooling_mode) << 16; + return word; + } + CONSTEXPR 
NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_op_pool_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_op_pool_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::pooling_mode get_pooling_mode() const + { + return static_cast(pooling_mode); + } + CONSTEXPR npu_op_pool_t& set_pooling_mode(NPU_NAMESPACE::pooling_mode value) + { + pooling_mode = static_cast(value) & ((1U << 3)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("pooling_mode", (pooling_mode < (sizeof(pooling_mode_str)/sizeof(pooling_mode_str[0])) ? pooling_mode_str[pooling_mode] : "****"))); + } +#endif +#endif +}; + +struct npu_op_elementwise_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t elementwise_mode:6; + uint32_t reserved1:10; +#ifdef __cplusplus +public: + npu_op_elementwise_t(NPU_NAMESPACE::elementwise_mode _elementwise_mode) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_ELEMENTWISE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + elementwise_mode(static_cast(_elementwise_mode) & ((1U << 6)-1)), + reserved1(0) + {} + CONSTEXPR npu_op_elementwise_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_ELEMENTWISE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + elementwise_mode(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_ELEMENTWISE) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = 
static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_ELEMENTWISE); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(elementwise_mode) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_op_elementwise_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_op_elementwise_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::elementwise_mode get_elementwise_mode() const + { + return static_cast(elementwise_mode); + } + CONSTEXPR npu_op_elementwise_t& set_elementwise_mode(NPU_NAMESPACE::elementwise_mode value) + { + elementwise_mode = static_cast(value) & ((1U << 6)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("elementwise_mode", (elementwise_mode < (sizeof(elementwise_mode_str)/sizeof(elementwise_mode_str[0])) ? 
elementwise_mode_str[elementwise_mode] : "****"))); + } +#endif +#endif +}; + +struct npu_op_dma_start_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t reserved1:16; +#ifdef __cplusplus +public: + CONSTEXPR npu_op_dma_start_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_DMA_START)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_DMA_START) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_DMA_START); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_op_dma_start_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_op_dma_start_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>&) const + { + } +#endif +#endif +}; + +struct npu_op_dma_wait_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t k:1; + uint32_t reserved1:15; +#ifdef __cplusplus +public: + npu_op_dma_wait_t(uint32_t _k) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_DMA_WAIT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + k(_k & ((1U << 1)-1)), + reserved1(0) + {} + CONSTEXPR npu_op_dma_wait_t() : + 
opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_DMA_WAIT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + k(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_DMA_WAIT) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_DMA_WAIT); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(k) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_op_dma_wait_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_op_dma_wait_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_k() const + { + return static_cast(k); + } + CONSTEXPR npu_op_dma_wait_t& set_k(uint32_t value) + { + assert((value >> 1) == 0); + k = static_cast(value & ((1U << 1)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("k", std::to_string(k))); + } +#endif +#endif +}; + +struct npu_op_kernel_wait_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t n:2; + uint32_t reserved1:14; +#ifdef __cplusplus +public: + npu_op_kernel_wait_t(uint32_t _n) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_KERNEL_WAIT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + n(_n & ((1U << 2)-1)), + reserved1(0) + {} + CONSTEXPR 
npu_op_kernel_wait_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_KERNEL_WAIT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + n(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_KERNEL_WAIT) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_KERNEL_WAIT); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(n) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_op_kernel_wait_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_op_kernel_wait_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_n() const + { + return static_cast(n); + } + CONSTEXPR npu_op_kernel_wait_t& set_n(uint32_t value) + { + assert((value >> 2) == 0); + n = static_cast(value & ((1U << 2)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("n", std::to_string(n))); + } +#endif +#endif +}; + +struct npu_op_pmu_mask_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t enable:1; + uint32_t reserved1:15; +#ifdef __cplusplus +public: + npu_op_pmu_mask_t(uint32_t _enable) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_PMU_MASK)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + enable(_enable & ((1U << 
1)-1)), + reserved1(0) + {} + CONSTEXPR npu_op_pmu_mask_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_PMU_MASK)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + enable(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_PMU_MASK) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_PMU_MASK); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(enable) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_op_pmu_mask_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_op_pmu_mask_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_enable() const + { + return static_cast(enable); + } + CONSTEXPR npu_op_pmu_mask_t& set_enable(uint32_t value) + { + assert((value >> 1) == 0); + enable = static_cast(value & ((1U << 1)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("enable", std::to_string(enable))); + } +#endif +#endif +}; + +struct npu_set_ifm_pad_top_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t pad:7; + uint32_t reserved1:9; +#ifdef __cplusplus +public: + npu_set_ifm_pad_top_t(uint32_t _pad) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_TOP)), + reserved0(0), + 
control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + pad(_pad & ((1U << 7)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_ifm_pad_top_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_TOP)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + pad(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_TOP) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_TOP); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(pad) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm_pad_top_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm_pad_top_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_pad() const + { + return static_cast(pad); + } + CONSTEXPR npu_set_ifm_pad_top_t& set_pad(uint32_t value) + { + assert((value >> 7) == 0); + pad = static_cast(value & ((1U << 7)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("pad", std::to_string(pad))); + } +#endif +#endif +}; + +struct npu_set_ifm_pad_left_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t pad:7; + uint32_t reserved1:9; +#ifdef __cplusplus +public: + npu_set_ifm_pad_left_t(uint32_t _pad) : + 
opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_LEFT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + pad(_pad & ((1U << 7)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_ifm_pad_left_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_LEFT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + pad(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_LEFT) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_LEFT); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(pad) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm_pad_left_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm_pad_left_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_pad() const + { + return static_cast(pad); + } + CONSTEXPR npu_set_ifm_pad_left_t& set_pad(uint32_t value) + { + assert((value >> 7) == 0); + pad = static_cast(value & ((1U << 7)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("pad", std::to_string(pad))); + } +#endif +#endif +}; + +struct npu_set_ifm_pad_right_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t pad:8; + uint32_t reserved1:8; 
+#ifdef __cplusplus +public: + npu_set_ifm_pad_right_t(uint32_t _pad) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_RIGHT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + pad(_pad & ((1U << 8)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_ifm_pad_right_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_RIGHT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + pad(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_RIGHT) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_RIGHT); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(pad) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm_pad_right_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm_pad_right_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_pad() const + { + return static_cast(pad); + } + CONSTEXPR npu_set_ifm_pad_right_t& set_pad(uint32_t value) + { + assert((value >> 8) == 0); + pad = static_cast(value & ((1U << 8)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("pad", std::to_string(pad))); + } +#endif +#endif +}; + +struct npu_set_ifm_pad_bottom_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + 
uint32_t reserved0:4; + uint32_t control:2; + uint32_t pad:8; + uint32_t reserved1:8; +#ifdef __cplusplus +public: + npu_set_ifm_pad_bottom_t(uint32_t _pad) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_BOTTOM)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + pad(_pad & ((1U << 8)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_ifm_pad_bottom_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_BOTTOM)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + pad(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_BOTTOM) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_BOTTOM); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(pad) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm_pad_bottom_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm_pad_bottom_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_pad() const + { + return static_cast(pad); + } + CONSTEXPR npu_set_ifm_pad_bottom_t& set_pad(uint32_t value) + { + assert((value >> 8) == 0); + pad = static_cast(value & ((1U << 8)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("pad", std::to_string(pad))); + } +#endif +#endif +}; + 
+struct npu_set_ifm_depth_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t depth_m1:16; +#ifdef __cplusplus +public: + npu_set_ifm_depth_m1_t(uint32_t _depth_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_DEPTH_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + depth_m1(_depth_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ifm_depth_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_DEPTH_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + depth_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_DEPTH_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_DEPTH_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(depth_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm_depth_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm_depth_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_depth_m1() const + { + return static_cast(depth_m1); + } + CONSTEXPR npu_set_ifm_depth_m1_t& set_depth_m1(uint32_t value) + { + assert((value >> 16) == 0); + depth_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + 
fields.push_back(std::make_pair("depth_m1", std::to_string(depth_m1))); + } +#endif +#endif +}; + +struct npu_set_ifm_precision_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t activation_type:1; + uint32_t reserved1:1; + uint32_t activation_precision:2; + uint32_t reserved2:2; + uint32_t activation_format:2; + uint32_t scale_mode:2; + uint32_t reserved3:4; + uint32_t round_mode:2; +#ifdef __cplusplus +public: + npu_set_ifm_precision_t(NPU_NAMESPACE::activation_type _activation_type, NPU_NAMESPACE::activation_precision _activation_precision, NPU_NAMESPACE::activation_format _activation_format, NPU_NAMESPACE::ifm_scale_mode _scale_mode, NPU_NAMESPACE::round_mode _round_mode) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PRECISION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + activation_type(static_cast(_activation_type) & ((1U << 1)-1)), + reserved1(0), + activation_precision(static_cast(_activation_precision) & ((1U << 2)-1)), + reserved2(0), + activation_format(static_cast(_activation_format) & ((1U << 2)-1)), + scale_mode(static_cast(_scale_mode) & ((1U << 2)-1)), + reserved3(0), + round_mode(static_cast(_round_mode) & ((1U << 2)-1)) + {} + CONSTEXPR npu_set_ifm_precision_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PRECISION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + activation_type(0), + reserved1(0), + activation_precision(0), + reserved2(0), + activation_format(0), + scale_mode(0), + reserved3(0), + round_mode(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PRECISION) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PRECISION); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator 
uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(activation_type) << 16; + word |= uint32_t(activation_precision) << 18; + word |= uint32_t(activation_format) << 22; + word |= uint32_t(scale_mode) << 24; + word |= uint32_t(round_mode) << 30; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm_precision_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm_precision_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::activation_type get_activation_type() const + { + return static_cast(activation_type); + } + CONSTEXPR npu_set_ifm_precision_t& set_activation_type(NPU_NAMESPACE::activation_type value) + { + activation_type = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::activation_precision get_activation_precision() const + { + return static_cast(activation_precision); + } + CONSTEXPR npu_set_ifm_precision_t& set_activation_precision(NPU_NAMESPACE::activation_precision value) + { + activation_precision = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::activation_format get_activation_format() const + { + return static_cast(activation_format); + } + CONSTEXPR npu_set_ifm_precision_t& set_activation_format(NPU_NAMESPACE::activation_format value) + { + activation_format = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::ifm_scale_mode get_scale_mode() const + { + return static_cast(scale_mode); + } + CONSTEXPR npu_set_ifm_precision_t& set_scale_mode(NPU_NAMESPACE::ifm_scale_mode value) + { + scale_mode = static_cast(value) & ((1U << 
2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::round_mode get_round_mode() const + { + return static_cast(round_mode); + } + CONSTEXPR npu_set_ifm_precision_t& set_round_mode(NPU_NAMESPACE::round_mode value) + { + round_mode = static_cast(value) & ((1U << 2)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("activation_type", (activation_type < (sizeof(activation_type_str)/sizeof(activation_type_str[0])) ? activation_type_str[activation_type] : "****"))); + fields.push_back(std::make_pair("activation_precision", (activation_precision < (sizeof(activation_precision_str)/sizeof(activation_precision_str[0])) ? activation_precision_str[activation_precision] : "****"))); + fields.push_back(std::make_pair("activation_format", (activation_format < (sizeof(activation_format_str)/sizeof(activation_format_str[0])) ? activation_format_str[activation_format] : "****"))); + fields.push_back(std::make_pair("scale_mode", (scale_mode < (sizeof(ifm_scale_mode_str)/sizeof(ifm_scale_mode_str[0])) ? ifm_scale_mode_str[scale_mode] : "****"))); + fields.push_back(std::make_pair("round_mode", (round_mode < (sizeof(round_mode_str)/sizeof(round_mode_str[0])) ? 
round_mode_str[round_mode] : "****"))); + } +#endif +#endif +}; + +struct npu_set_ifm_upscale_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t mode:2; + uint32_t reserved1:14; +#ifdef __cplusplus +public: + npu_set_ifm_upscale_t(NPU_NAMESPACE::ifm_upscale_mode _mode) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_UPSCALE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + mode(static_cast(_mode) & ((1U << 2)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_ifm_upscale_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_UPSCALE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + mode(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_UPSCALE) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_UPSCALE); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(mode) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm_upscale_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm_upscale_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::ifm_upscale_mode get_mode() const + { + return static_cast(mode); + } + CONSTEXPR npu_set_ifm_upscale_t& set_mode(NPU_NAMESPACE::ifm_upscale_mode value) + { + mode = static_cast(value) & ((1U << 
2)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("mode", (mode < (sizeof(ifm_upscale_mode_str)/sizeof(ifm_upscale_mode_str[0])) ? ifm_upscale_mode_str[mode] : "****"))); + } +#endif +#endif +}; + +struct npu_set_ifm_zero_point_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t zero_point:16; +#ifdef __cplusplus +public: + npu_set_ifm_zero_point_t(uint32_t _zero_point) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_ZERO_POINT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + zero_point(_zero_point & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ifm_zero_point_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_ZERO_POINT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + zero_point(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_ZERO_POINT) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_ZERO_POINT); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(zero_point) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm_zero_point_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm_zero_point_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t 
get_zero_point() const + { + return static_cast(zero_point); + } + CONSTEXPR npu_set_ifm_zero_point_t& set_zero_point(uint32_t value) + { + assert((value >> 16) == 0); + zero_point = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("zero_point", std::to_string(zero_point))); + } +#endif +#endif +}; + +struct npu_set_ifm_width0_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t width_m1:16; +#ifdef __cplusplus +public: + npu_set_ifm_width0_m1_t(uint32_t _width_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_WIDTH0_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + width_m1(_width_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ifm_width0_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_WIDTH0_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + width_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_WIDTH0_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_WIDTH0_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(width_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm_width0_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm_width0_m1_t& 
set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_width_m1() const + { + return static_cast(width_m1); + } + CONSTEXPR npu_set_ifm_width0_m1_t& set_width_m1(uint32_t value) + { + assert((value >> 16) == 0); + width_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("width_m1", std::to_string(width_m1))); + } +#endif +#endif +}; + +struct npu_set_ifm_height0_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t height_m1:16; +#ifdef __cplusplus +public: + npu_set_ifm_height0_m1_t(uint32_t _height_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_HEIGHT0_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(_height_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ifm_height0_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_HEIGHT0_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_HEIGHT0_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_HEIGHT0_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(height_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm_height0_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR 
NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm_height0_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_height_m1() const + { + return static_cast(height_m1); + } + CONSTEXPR npu_set_ifm_height0_m1_t& set_height_m1(uint32_t value) + { + assert((value >> 16) == 0); + height_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("height_m1", std::to_string(height_m1))); + } +#endif +#endif +}; + +struct npu_set_ifm_height1_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t height_m1:16; +#ifdef __cplusplus +public: + npu_set_ifm_height1_m1_t(uint32_t _height_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_HEIGHT1_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(_height_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ifm_height1_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_HEIGHT1_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_HEIGHT1_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_HEIGHT1_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(height_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm_height1_m1_t& 
set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm_height1_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_height_m1() const + { + return static_cast(height_m1); + } + CONSTEXPR npu_set_ifm_height1_m1_t& set_height_m1(uint32_t value) + { + assert((value >> 16) == 0); + height_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("height_m1", std::to_string(height_m1))); + } +#endif +#endif +}; + +struct npu_set_ifm_ib_end_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t ib_end:6; + uint32_t reserved1:10; +#ifdef __cplusplus +public: + npu_set_ifm_ib_end_t(uint32_t _ib_end) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_IB_END)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + ib_end(_ib_end & ((1U << 6)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_ifm_ib_end_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_IB_END)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + ib_end(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_IB_END) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_IB_END); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(ib_end) << 16; + return word; + } + CONSTEXPR 
NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm_ib_end_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm_ib_end_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_ib_end() const + { + return static_cast(ib_end); + } + CONSTEXPR npu_set_ifm_ib_end_t& set_ib_end(uint32_t value) + { + assert((value >> 6) == 0); + ib_end = static_cast(value & ((1U << 6)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("ib_end", std::to_string(ib_end))); + } +#endif +#endif +}; + +struct npu_set_ifm_region_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t region:3; + uint32_t reserved1:13; +#ifdef __cplusplus +public: + npu_set_ifm_region_t(uint32_t _region) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(_region & ((1U << 3)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_ifm_region_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_REGION) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_REGION); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= 
uint32_t(control) << 14; + word |= uint32_t(region) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm_region_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm_region_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_region() const + { + return static_cast(region); + } + CONSTEXPR npu_set_ifm_region_t& set_region(uint32_t value) + { + assert((value >> 3) == 0); + region = static_cast(value & ((1U << 3)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("region", std::to_string(region))); + } +#endif +#endif +}; + +struct npu_set_ofm_width_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t width_m1:16; +#ifdef __cplusplus +public: + npu_set_ofm_width_m1_t(uint32_t _width_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_WIDTH_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + width_m1(_width_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ofm_width_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_WIDTH_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + width_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_WIDTH_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_WIDTH_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + 
uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(width_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ofm_width_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ofm_width_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_width_m1() const + { + return static_cast(width_m1); + } + CONSTEXPR npu_set_ofm_width_m1_t& set_width_m1(uint32_t value) + { + assert((value >> 16) == 0); + width_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("width_m1", std::to_string(width_m1))); + } +#endif +#endif +}; + +struct npu_set_ofm_height_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t height_m1:16; +#ifdef __cplusplus +public: + npu_set_ofm_height_m1_t(uint32_t _height_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(_height_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ofm_height_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT_M1); 
control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(height_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ofm_height_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ofm_height_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_height_m1() const + { + return static_cast(height_m1); + } + CONSTEXPR npu_set_ofm_height_m1_t& set_height_m1(uint32_t value) + { + assert((value >> 16) == 0); + height_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("height_m1", std::to_string(height_m1))); + } +#endif +#endif +}; + +struct npu_set_ofm_depth_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t depth_m1:16; +#ifdef __cplusplus +public: + npu_set_ofm_depth_m1_t(uint32_t _depth_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_DEPTH_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + depth_m1(_depth_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ofm_depth_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_DEPTH_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + depth_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_DEPTH_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR 
void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_DEPTH_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(depth_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ofm_depth_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ofm_depth_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_depth_m1() const + { + return static_cast(depth_m1); + } + CONSTEXPR npu_set_ofm_depth_m1_t& set_depth_m1(uint32_t value) + { + assert((value >> 16) == 0); + depth_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("depth_m1", std::to_string(depth_m1))); + } +#endif +#endif +}; + +struct npu_set_ofm_precision_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t activation_type:1; + uint32_t activation_precision:2; + uint32_t reserved1:3; + uint32_t activation_format:2; + uint32_t scale_mode:1; + uint32_t reserved2:5; + uint32_t round_mode:2; +#ifdef __cplusplus +public: + npu_set_ofm_precision_t(NPU_NAMESPACE::activation_type _activation_type, NPU_NAMESPACE::activation_precision _activation_precision, NPU_NAMESPACE::activation_format _activation_format, NPU_NAMESPACE::ofm_scale_mode _scale_mode, NPU_NAMESPACE::round_mode _round_mode) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_PRECISION)), + reserved0(0), + 
control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + activation_type(static_cast(_activation_type) & ((1U << 1)-1)), + activation_precision(static_cast(_activation_precision) & ((1U << 2)-1)), + reserved1(0), + activation_format(static_cast(_activation_format) & ((1U << 2)-1)), + scale_mode(static_cast(_scale_mode) & ((1U << 1)-1)), + reserved2(0), + round_mode(static_cast(_round_mode) & ((1U << 2)-1)) + {} + CONSTEXPR npu_set_ofm_precision_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_PRECISION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + activation_type(0), + activation_precision(0), + reserved1(0), + activation_format(0), + scale_mode(0), + reserved2(0), + round_mode(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_PRECISION) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_PRECISION); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(activation_type) << 16; + word |= uint32_t(activation_precision) << 17; + word |= uint32_t(activation_format) << 22; + word |= uint32_t(scale_mode) << 24; + word |= uint32_t(round_mode) << 30; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ofm_precision_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ofm_precision_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::activation_type 
get_activation_type() const + { + return static_cast(activation_type); + } + CONSTEXPR npu_set_ofm_precision_t& set_activation_type(NPU_NAMESPACE::activation_type value) + { + activation_type = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::activation_precision get_activation_precision() const + { + return static_cast(activation_precision); + } + CONSTEXPR npu_set_ofm_precision_t& set_activation_precision(NPU_NAMESPACE::activation_precision value) + { + activation_precision = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::activation_format get_activation_format() const + { + return static_cast(activation_format); + } + CONSTEXPR npu_set_ofm_precision_t& set_activation_format(NPU_NAMESPACE::activation_format value) + { + activation_format = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::ofm_scale_mode get_scale_mode() const + { + return static_cast(scale_mode); + } + CONSTEXPR npu_set_ofm_precision_t& set_scale_mode(NPU_NAMESPACE::ofm_scale_mode value) + { + scale_mode = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::round_mode get_round_mode() const + { + return static_cast(round_mode); + } + CONSTEXPR npu_set_ofm_precision_t& set_round_mode(NPU_NAMESPACE::round_mode value) + { + round_mode = static_cast(value) & ((1U << 2)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("activation_type", (activation_type < (sizeof(activation_type_str)/sizeof(activation_type_str[0])) ? activation_type_str[activation_type] : "****"))); + fields.push_back(std::make_pair("activation_precision", (activation_precision < (sizeof(activation_precision_str)/sizeof(activation_precision_str[0])) ? 
activation_precision_str[activation_precision] : "****"))); + fields.push_back(std::make_pair("activation_format", (activation_format < (sizeof(activation_format_str)/sizeof(activation_format_str[0])) ? activation_format_str[activation_format] : "****"))); + fields.push_back(std::make_pair("scale_mode", (scale_mode < (sizeof(ofm_scale_mode_str)/sizeof(ofm_scale_mode_str[0])) ? ofm_scale_mode_str[scale_mode] : "****"))); + fields.push_back(std::make_pair("round_mode", (round_mode < (sizeof(round_mode_str)/sizeof(round_mode_str[0])) ? round_mode_str[round_mode] : "****"))); + } +#endif +#endif +}; + +struct npu_set_ofm_blk_width_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t width_m1:6; + uint32_t reserved1:10; +#ifdef __cplusplus +public: + npu_set_ofm_blk_width_m1_t(uint32_t _width_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_WIDTH_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + width_m1(_width_m1 & ((1U << 6)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_ofm_blk_width_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_WIDTH_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + width_m1(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_WIDTH_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_WIDTH_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(width_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ofm_blk_width_m1_t& 
set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ofm_blk_width_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_width_m1() const + { + return static_cast(width_m1); + } + CONSTEXPR npu_set_ofm_blk_width_m1_t& set_width_m1(uint32_t value) + { + assert((value >> 6) == 0); + width_m1 = static_cast(value & ((1U << 6)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("width_m1", std::to_string(width_m1))); + } +#endif +#endif +}; + +struct npu_set_ofm_blk_height_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t height_m1:5; + uint32_t reserved1:11; +#ifdef __cplusplus +public: + npu_set_ofm_blk_height_m1_t(uint32_t _height_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_HEIGHT_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(_height_m1 & ((1U << 5)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_ofm_blk_height_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_HEIGHT_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_HEIGHT_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_HEIGHT_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word 
|= uint32_t(height_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ofm_blk_height_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ofm_blk_height_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_height_m1() const + { + return static_cast(height_m1); + } + CONSTEXPR npu_set_ofm_blk_height_m1_t& set_height_m1(uint32_t value) + { + assert((value >> 5) == 0); + height_m1 = static_cast(value & ((1U << 5)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("height_m1", std::to_string(height_m1))); + } +#endif +#endif +}; + +struct npu_set_ofm_blk_depth_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t depth_m1:7; + uint32_t reserved1:9; +#ifdef __cplusplus +public: + npu_set_ofm_blk_depth_m1_t(uint32_t _depth_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_DEPTH_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + depth_m1(_depth_m1 & ((1U << 7)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_ofm_blk_depth_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_DEPTH_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + depth_m1(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_DEPTH_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_DEPTH_M1); 
control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(depth_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ofm_blk_depth_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ofm_blk_depth_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_depth_m1() const + { + return static_cast(depth_m1); + } + CONSTEXPR npu_set_ofm_blk_depth_m1_t& set_depth_m1(uint32_t value) + { + assert((value >> 7) == 0); + depth_m1 = static_cast(value & ((1U << 7)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("depth_m1", std::to_string(depth_m1))); + } +#endif +#endif +}; + +struct npu_set_ofm_zero_point_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t zero_point:16; +#ifdef __cplusplus +public: + npu_set_ofm_zero_point_t(uint32_t _zero_point) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_ZERO_POINT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + zero_point(_zero_point & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ofm_zero_point_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_ZERO_POINT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + zero_point(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_ZERO_POINT) && control == 
static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_ZERO_POINT); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(zero_point) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ofm_zero_point_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ofm_zero_point_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_zero_point() const + { + return static_cast(zero_point); + } + CONSTEXPR npu_set_ofm_zero_point_t& set_zero_point(uint32_t value) + { + assert((value >> 16) == 0); + zero_point = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("zero_point", std::to_string(zero_point))); + } +#endif +#endif +}; + +struct npu_set_ofm_width0_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t width_m1:16; +#ifdef __cplusplus +public: + npu_set_ofm_width0_m1_t(uint32_t _width_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_WIDTH0_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + width_m1(_width_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ofm_width0_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_WIDTH0_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + width_m1(0) + {} + CONSTEXPR bool 
valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_WIDTH0_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_WIDTH0_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(width_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ofm_width0_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ofm_width0_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_width_m1() const + { + return static_cast(width_m1); + } + CONSTEXPR npu_set_ofm_width0_m1_t& set_width_m1(uint32_t value) + { + assert((value >> 16) == 0); + width_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("width_m1", std::to_string(width_m1))); + } +#endif +#endif +}; + +struct npu_set_ofm_height0_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t height_m1:16; +#ifdef __cplusplus +public: + npu_set_ofm_height0_m1_t(uint32_t _height_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT0_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(_height_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ofm_height0_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT0_M1)), + 
reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT0_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT0_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(height_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ofm_height0_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ofm_height0_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_height_m1() const + { + return static_cast(height_m1); + } + CONSTEXPR npu_set_ofm_height0_m1_t& set_height_m1(uint32_t value) + { + assert((value >> 16) == 0); + height_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("height_m1", std::to_string(height_m1))); + } +#endif +#endif +}; + +struct npu_set_ofm_height1_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t height_m1:16; +#ifdef __cplusplus +public: + npu_set_ofm_height1_m1_t(uint32_t _height_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT1_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(_height_m1 & ((1U << 16)-1)) + {} + 
CONSTEXPR npu_set_ofm_height1_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT1_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT1_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT1_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(height_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ofm_height1_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ofm_height1_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_height_m1() const + { + return static_cast(height_m1); + } + CONSTEXPR npu_set_ofm_height1_m1_t& set_height_m1(uint32_t value) + { + assert((value >> 16) == 0); + height_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("height_m1", std::to_string(height_m1))); + } +#endif +#endif +}; + +struct npu_set_ofm_region_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t region:3; + uint32_t reserved1:13; +#ifdef __cplusplus +public: + npu_set_ofm_region_t(uint32_t _region) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_REGION)), + 
reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(_region & ((1U << 3)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_ofm_region_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_REGION) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_REGION); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(region) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ofm_region_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ofm_region_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_region() const + { + return static_cast(region); + } + CONSTEXPR npu_set_ofm_region_t& set_region(uint32_t value) + { + assert((value >> 3) == 0); + region = static_cast(value & ((1U << 3)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("region", std::to_string(region))); + } +#endif +#endif +}; + +struct npu_set_kernel_width_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t width_m1:16; +#ifdef __cplusplus +public: + npu_set_kernel_width_m1_t(uint32_t 
_width_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_WIDTH_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + width_m1(_width_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_kernel_width_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_WIDTH_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + width_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_WIDTH_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_WIDTH_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(width_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_kernel_width_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_kernel_width_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_width_m1() const + { + return static_cast(width_m1); + } + CONSTEXPR npu_set_kernel_width_m1_t& set_width_m1(uint32_t value) + { + assert((value >> 16) == 0); + width_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("width_m1", std::to_string(width_m1))); + } +#endif +#endif +}; + +struct npu_set_kernel_height_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + 
uint32_t control:2; + uint32_t height_m1:16; +#ifdef __cplusplus +public: + npu_set_kernel_height_m1_t(uint32_t _height_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_HEIGHT_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(_height_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_kernel_height_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_HEIGHT_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_HEIGHT_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_HEIGHT_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(height_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_kernel_height_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_kernel_height_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_height_m1() const + { + return static_cast(height_m1); + } + CONSTEXPR npu_set_kernel_height_m1_t& set_height_m1(uint32_t value) + { + assert((value >> 16) == 0); + height_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("height_m1", std::to_string(height_m1))); + } +#endif 
+#endif +}; + +struct npu_set_kernel_stride_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t stride_x_lsb:1; + uint32_t stride_y_lsb:1; + uint32_t weight_order:1; + uint32_t dilation_x:1; + uint32_t dilation_y:1; + uint32_t decomposition:1; + uint32_t stride_x_msb:1; + uint32_t reserved1:2; + uint32_t stride_y_msb:1; + uint32_t reserved2:6; +#ifdef __cplusplus +public: + npu_set_kernel_stride_t(uint32_t _stride_x_lsb, uint32_t _stride_y_lsb, NPU_NAMESPACE::weight_order _weight_order, NPU_NAMESPACE::kernel_dilation _dilation_x, NPU_NAMESPACE::kernel_dilation _dilation_y, NPU_NAMESPACE::kernel_decomposition _decomposition, uint32_t _stride_x_msb, uint32_t _stride_y_msb) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_STRIDE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + stride_x_lsb(_stride_x_lsb & ((1U << 1)-1)), + stride_y_lsb(_stride_y_lsb & ((1U << 1)-1)), + weight_order(static_cast(_weight_order) & ((1U << 1)-1)), + dilation_x(static_cast(_dilation_x) & ((1U << 1)-1)), + dilation_y(static_cast(_dilation_y) & ((1U << 1)-1)), + decomposition(static_cast(_decomposition) & ((1U << 1)-1)), + stride_x_msb(_stride_x_msb & ((1U << 1)-1)), + reserved1(0), + stride_y_msb(_stride_y_msb & ((1U << 1)-1)), + reserved2(0) + {} + CONSTEXPR npu_set_kernel_stride_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_STRIDE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + stride_x_lsb(0), + stride_y_lsb(0), + weight_order(0), + dilation_x(0), + dilation_y(0), + decomposition(0), + stride_x_msb(0), + reserved1(0), + stride_y_msb(0), + reserved2(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_STRIDE) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = 
static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_STRIDE); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(stride_x_lsb) << 16; + word |= uint32_t(stride_y_lsb) << 17; + word |= uint32_t(weight_order) << 18; + word |= uint32_t(dilation_x) << 19; + word |= uint32_t(dilation_y) << 20; + word |= uint32_t(decomposition) << 21; + word |= uint32_t(stride_x_msb) << 22; + word |= uint32_t(stride_y_msb) << 25; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_kernel_stride_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_kernel_stride_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_stride_x_lsb() const + { + return static_cast(stride_x_lsb); + } + CONSTEXPR npu_set_kernel_stride_t& set_stride_x_lsb(uint32_t value) + { + assert((value >> 1) == 0); + stride_x_lsb = static_cast(value & ((1U << 1)-1)); + return *this; + } + CONSTEXPR uint32_t get_stride_y_lsb() const + { + return static_cast(stride_y_lsb); + } + CONSTEXPR npu_set_kernel_stride_t& set_stride_y_lsb(uint32_t value) + { + assert((value >> 1) == 0); + stride_y_lsb = static_cast(value & ((1U << 1)-1)); + return *this; + } + CONSTEXPR NPU_NAMESPACE::weight_order get_weight_order() const + { + return static_cast(weight_order); + } + CONSTEXPR npu_set_kernel_stride_t& set_weight_order(NPU_NAMESPACE::weight_order value) + { + weight_order = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::kernel_dilation get_dilation_x() const + { + return static_cast(dilation_x); + } + 
CONSTEXPR npu_set_kernel_stride_t& set_dilation_x(NPU_NAMESPACE::kernel_dilation value) + { + dilation_x = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::kernel_dilation get_dilation_y() const + { + return static_cast(dilation_y); + } + CONSTEXPR npu_set_kernel_stride_t& set_dilation_y(NPU_NAMESPACE::kernel_dilation value) + { + dilation_y = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::kernel_decomposition get_decomposition() const + { + return static_cast(decomposition); + } + CONSTEXPR npu_set_kernel_stride_t& set_decomposition(NPU_NAMESPACE::kernel_decomposition value) + { + decomposition = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR uint32_t get_stride_x_msb() const + { + return static_cast(stride_x_msb); + } + CONSTEXPR npu_set_kernel_stride_t& set_stride_x_msb(uint32_t value) + { + assert((value >> 1) == 0); + stride_x_msb = static_cast(value & ((1U << 1)-1)); + return *this; + } + CONSTEXPR uint32_t get_stride_y_msb() const + { + return static_cast(stride_y_msb); + } + CONSTEXPR npu_set_kernel_stride_t& set_stride_y_msb(uint32_t value) + { + assert((value >> 1) == 0); + stride_y_msb = static_cast(value & ((1U << 1)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("stride_x_lsb", std::to_string(stride_x_lsb))); + fields.push_back(std::make_pair("stride_y_lsb", std::to_string(stride_y_lsb))); + fields.push_back(std::make_pair("weight_order", (weight_order < (sizeof(weight_order_str)/sizeof(weight_order_str[0])) ? weight_order_str[weight_order] : "****"))); + fields.push_back(std::make_pair("dilation_x", (dilation_x < (sizeof(kernel_dilation_str)/sizeof(kernel_dilation_str[0])) ? kernel_dilation_str[dilation_x] : "****"))); + fields.push_back(std::make_pair("dilation_y", (dilation_y < (sizeof(kernel_dilation_str)/sizeof(kernel_dilation_str[0])) ? 
kernel_dilation_str[dilation_y] : "****"))); + fields.push_back(std::make_pair("decomposition", (decomposition < (sizeof(kernel_decomposition_str)/sizeof(kernel_decomposition_str[0])) ? kernel_decomposition_str[decomposition] : "****"))); + fields.push_back(std::make_pair("stride_x_msb", std::to_string(stride_x_msb))); + fields.push_back(std::make_pair("stride_y_msb", std::to_string(stride_y_msb))); + } +#endif +#endif +}; + +struct npu_set_parallel_mode_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t parallel_mode:1; + uint32_t reserved1:15; +#ifdef __cplusplus +public: + npu_set_parallel_mode_t(NPU_NAMESPACE::parallel_mode _parallel_mode) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_PARALLEL_MODE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + parallel_mode(static_cast(_parallel_mode) & ((1U << 1)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_parallel_mode_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_PARALLEL_MODE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + parallel_mode(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_PARALLEL_MODE) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_PARALLEL_MODE); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(parallel_mode) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_parallel_mode_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl 
get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_parallel_mode_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::parallel_mode get_parallel_mode() const + { + return static_cast(parallel_mode); + } + CONSTEXPR npu_set_parallel_mode_t& set_parallel_mode(NPU_NAMESPACE::parallel_mode value) + { + parallel_mode = static_cast(value) & ((1U << 1)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("parallel_mode", (parallel_mode < (sizeof(parallel_mode_str)/sizeof(parallel_mode_str[0])) ? parallel_mode_str[parallel_mode] : "****"))); + } +#endif +#endif +}; + +struct npu_set_acc_format_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t acc_format:2; + uint32_t reserved1:14; +#ifdef __cplusplus +public: + npu_set_acc_format_t(NPU_NAMESPACE::acc_format _acc_format) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACC_FORMAT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + acc_format(static_cast(_acc_format) & ((1U << 2)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_acc_format_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACC_FORMAT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + acc_format(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACC_FORMAT) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACC_FORMAT); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(acc_format) << 16; + return 
word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_acc_format_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_acc_format_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::acc_format get_acc_format() const + { + return static_cast(acc_format); + } + CONSTEXPR npu_set_acc_format_t& set_acc_format(NPU_NAMESPACE::acc_format value) + { + acc_format = static_cast(value) & ((1U << 2)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("acc_format", (acc_format < (sizeof(acc_format_str)/sizeof(acc_format_str[0])) ? acc_format_str[acc_format] : "****"))); + } +#endif +#endif +}; + +struct npu_set_activation_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t activation_function:5; + uint32_t reserved1:7; + uint32_t activation_clip_range:3; + uint32_t reserved2:1; +#ifdef __cplusplus +public: + npu_set_activation_t(NPU_NAMESPACE::activation_function _activation_function, NPU_NAMESPACE::activation_clip_range _activation_clip_range) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + activation_function(static_cast(_activation_function) & ((1U << 5)-1)), + reserved1(0), + activation_clip_range(static_cast(_activation_clip_range) & ((1U << 3)-1)), + reserved2(0) + {} + CONSTEXPR npu_set_activation_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + activation_function(0), + 
reserved1(0), + activation_clip_range(0), + reserved2(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(activation_function) << 16; + word |= uint32_t(activation_clip_range) << 28; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_activation_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_activation_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::activation_function get_activation_function() const + { + return static_cast(activation_function); + } + CONSTEXPR npu_set_activation_t& set_activation_function(NPU_NAMESPACE::activation_function value) + { + activation_function = static_cast(value) & ((1U << 5)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::activation_clip_range get_activation_clip_range() const + { + return static_cast(activation_clip_range); + } + CONSTEXPR npu_set_activation_t& set_activation_clip_range(NPU_NAMESPACE::activation_clip_range value) + { + activation_clip_range = static_cast(value) & ((1U << 3)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("activation_function", (activation_function < (sizeof(activation_function_str)/sizeof(activation_function_str[0])) ? 
activation_function_str[activation_function] : "****"))); + fields.push_back(std::make_pair("activation_clip_range", (activation_clip_range < (sizeof(activation_clip_range_str)/sizeof(activation_clip_range_str[0])) ? activation_clip_range_str[activation_clip_range] : "****"))); + } +#endif +#endif +}; + +struct npu_set_activation_min_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t clip_boundary:16; +#ifdef __cplusplus +public: + npu_set_activation_min_t(uint32_t _clip_boundary) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION_MIN)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + clip_boundary(_clip_boundary & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_activation_min_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION_MIN)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + clip_boundary(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION_MIN) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION_MIN); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(clip_boundary) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_activation_min_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_activation_min_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return 
*this; + } + CONSTEXPR uint32_t get_clip_boundary() const + { + return static_cast(clip_boundary); + } + CONSTEXPR npu_set_activation_min_t& set_clip_boundary(uint32_t value) + { + assert((value >> 16) == 0); + clip_boundary = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("clip_boundary", std::to_string(clip_boundary))); + } +#endif +#endif +}; + +struct npu_set_activation_max_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t clip_boundary:16; +#ifdef __cplusplus +public: + npu_set_activation_max_t(uint32_t _clip_boundary) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION_MAX)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + clip_boundary(_clip_boundary & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_activation_max_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION_MAX)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + clip_boundary(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION_MAX) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION_MAX); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(clip_boundary) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_activation_max_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return 
static_cast(control); + } + CONSTEXPR npu_set_activation_max_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_clip_boundary() const + { + return static_cast(clip_boundary); + } + CONSTEXPR npu_set_activation_max_t& set_clip_boundary(uint32_t value) + { + assert((value >> 16) == 0); + clip_boundary = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("clip_boundary", std::to_string(clip_boundary))); + } +#endif +#endif +}; + +struct npu_set_weight_region_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t region:3; + uint32_t reserved1:13; +#ifdef __cplusplus +public: + npu_set_weight_region_t(uint32_t _region) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_WEIGHT_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(_region & ((1U << 3)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_weight_region_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_WEIGHT_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_WEIGHT_REGION) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_WEIGHT_REGION); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(region) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_weight_region_t& 
set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_weight_region_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_region() const + { + return static_cast(region); + } + CONSTEXPR npu_set_weight_region_t& set_region(uint32_t value) + { + assert((value >> 3) == 0); + region = static_cast(value & ((1U << 3)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("region", std::to_string(region))); + } +#endif +#endif +}; + +struct npu_set_scale_region_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t region:3; + uint32_t reserved1:13; +#ifdef __cplusplus +public: + npu_set_scale_region_t(uint32_t _region) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_SCALE_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(_region & ((1U << 3)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_scale_region_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_SCALE_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_SCALE_REGION) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_SCALE_REGION); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(region) << 16; + return word; + } + CONSTEXPR 
NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_scale_region_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_scale_region_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_region() const + { + return static_cast(region); + } + CONSTEXPR npu_set_scale_region_t& set_region(uint32_t value) + { + assert((value >> 3) == 0); + region = static_cast(value & ((1U << 3)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("region", std::to_string(region))); + } +#endif +#endif +}; + +struct npu_set_ab_start_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t ab_start:6; + uint32_t reserved1:10; +#ifdef __cplusplus +public: + npu_set_ab_start_t(uint32_t _ab_start) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_AB_START)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + ab_start(_ab_start & ((1U << 6)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_ab_start_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_AB_START)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + ab_start(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_AB_START) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_AB_START); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= 
uint32_t(control) << 14; + word |= uint32_t(ab_start) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ab_start_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ab_start_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_ab_start() const + { + return static_cast(ab_start); + } + CONSTEXPR npu_set_ab_start_t& set_ab_start(uint32_t value) + { + assert((value >> 6) == 0); + ab_start = static_cast(value & ((1U << 6)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("ab_start", std::to_string(ab_start))); + } +#endif +#endif +}; + +struct npu_set_blockdep_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t blockdep:2; + uint32_t reserved1:14; +#ifdef __cplusplus +public: + npu_set_blockdep_t(uint32_t _blockdep) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_BLOCKDEP)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + blockdep(_blockdep & ((1U << 2)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_blockdep_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_BLOCKDEP)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + blockdep(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_BLOCKDEP) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_BLOCKDEP); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + 
} + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(blockdep) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_blockdep_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_blockdep_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_blockdep() const + { + return static_cast(blockdep); + } + CONSTEXPR npu_set_blockdep_t& set_blockdep(uint32_t value) + { + assert((value >> 2) == 0); + blockdep = static_cast(value & ((1U << 2)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("blockdep", std::to_string(blockdep))); + } +#endif +#endif +}; + +struct npu_set_dma0_src_region_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t region:3; + uint32_t reserved1:5; + uint32_t region_mode:1; + uint32_t stride_mode:2; + uint32_t reserved2:5; +#ifdef __cplusplus +public: + npu_set_dma0_src_region_t(uint32_t _region, NPU_NAMESPACE::dma_region_mode _region_mode, NPU_NAMESPACE::dma_stride_mode _stride_mode) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SRC_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(_region & ((1U << 3)-1)), + reserved1(0), + region_mode(static_cast(_region_mode) & ((1U << 1)-1)), + stride_mode(static_cast(_stride_mode) & ((1U << 2)-1)), + reserved2(0) + {} + CONSTEXPR npu_set_dma0_src_region_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SRC_REGION)), + reserved0(0), + 
control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(0), + reserved1(0), + region_mode(0), + stride_mode(0), + reserved2(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SRC_REGION) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SRC_REGION); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(region) << 16; + word |= uint32_t(region_mode) << 24; + word |= uint32_t(stride_mode) << 25; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_dma0_src_region_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_dma0_src_region_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_region() const + { + return static_cast(region); + } + CONSTEXPR npu_set_dma0_src_region_t& set_region(uint32_t value) + { + assert((value >> 3) == 0); + region = static_cast(value & ((1U << 3)-1)); + return *this; + } + CONSTEXPR NPU_NAMESPACE::dma_region_mode get_region_mode() const + { + return static_cast(region_mode); + } + CONSTEXPR npu_set_dma0_src_region_t& set_region_mode(NPU_NAMESPACE::dma_region_mode value) + { + region_mode = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::dma_stride_mode get_stride_mode() const + { + return static_cast(stride_mode); + } + CONSTEXPR npu_set_dma0_src_region_t& set_stride_mode(NPU_NAMESPACE::dma_stride_mode value) + { + stride_mode = 
static_cast(value) & ((1U << 2)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("region", std::to_string(region))); + fields.push_back(std::make_pair("region_mode", (region_mode < (sizeof(dma_region_mode_str)/sizeof(dma_region_mode_str[0])) ? dma_region_mode_str[region_mode] : "****"))); + fields.push_back(std::make_pair("stride_mode", (stride_mode < (sizeof(dma_stride_mode_str)/sizeof(dma_stride_mode_str[0])) ? dma_stride_mode_str[stride_mode] : "****"))); + } +#endif +#endif +}; + +struct npu_set_dma0_dst_region_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t region:3; + uint32_t reserved1:5; + uint32_t region_mode:1; + uint32_t stride_mode:2; + uint32_t reserved2:5; +#ifdef __cplusplus +public: + npu_set_dma0_dst_region_t(uint32_t _region, NPU_NAMESPACE::dma_region_mode _region_mode, NPU_NAMESPACE::dma_stride_mode _stride_mode) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_DST_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(_region & ((1U << 3)-1)), + reserved1(0), + region_mode(static_cast(_region_mode) & ((1U << 1)-1)), + stride_mode(static_cast(_stride_mode) & ((1U << 2)-1)), + reserved2(0) + {} + CONSTEXPR npu_set_dma0_dst_region_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_DST_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(0), + reserved1(0), + region_mode(0), + stride_mode(0), + reserved2(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_DST_REGION) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_DST_REGION); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + 
{ + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(region) << 16; + word |= uint32_t(region_mode) << 24; + word |= uint32_t(stride_mode) << 25; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_dma0_dst_region_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_dma0_dst_region_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_region() const + { + return static_cast(region); + } + CONSTEXPR npu_set_dma0_dst_region_t& set_region(uint32_t value) + { + assert((value >> 3) == 0); + region = static_cast(value & ((1U << 3)-1)); + return *this; + } + CONSTEXPR NPU_NAMESPACE::dma_region_mode get_region_mode() const + { + return static_cast(region_mode); + } + CONSTEXPR npu_set_dma0_dst_region_t& set_region_mode(NPU_NAMESPACE::dma_region_mode value) + { + region_mode = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::dma_stride_mode get_stride_mode() const + { + return static_cast(stride_mode); + } + CONSTEXPR npu_set_dma0_dst_region_t& set_stride_mode(NPU_NAMESPACE::dma_stride_mode value) + { + stride_mode = static_cast(value) & ((1U << 2)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("region", std::to_string(region))); + fields.push_back(std::make_pair("region_mode", (region_mode < (sizeof(dma_region_mode_str)/sizeof(dma_region_mode_str[0])) ? dma_region_mode_str[region_mode] : "****"))); + fields.push_back(std::make_pair("stride_mode", (stride_mode < (sizeof(dma_stride_mode_str)/sizeof(dma_stride_mode_str[0])) ? 
dma_stride_mode_str[stride_mode] : "****"))); + } +#endif +#endif +}; + +struct npu_set_dma0_size0_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t size:16; +#ifdef __cplusplus +public: + npu_set_dma0_size0_t(uint32_t _size) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SIZE0)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + size(_size & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_dma0_size0_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SIZE0)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + size(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SIZE0) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SIZE0); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(size) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_dma0_size0_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_dma0_size0_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_size() const + { + return static_cast(size); + } + CONSTEXPR npu_set_dma0_size0_t& set_size(uint32_t value) + { + assert((value >> 16) == 0); + size = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + 
fields.push_back(std::make_pair("size", std::to_string(size))); + } +#endif +#endif +}; + +struct npu_set_dma0_size1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t size:16; +#ifdef __cplusplus +public: + npu_set_dma0_size1_t(uint32_t _size) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SIZE1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + size(_size & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_dma0_size1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SIZE1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + size(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SIZE1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SIZE1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(size) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_dma0_size1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_dma0_size1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_size() const + { + return static_cast(size); + } + CONSTEXPR npu_set_dma0_size1_t& set_size(uint32_t value) + { + assert((value >> 16) == 0); + size = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) 
const + { + fields.push_back(std::make_pair("size", std::to_string(size))); + } +#endif +#endif +}; + +struct npu_set_ifm2_broadcast_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t broadcast_h:1; + uint32_t broadcast_w:1; + uint32_t broadcast_c:1; + uint32_t reserved1:3; + uint32_t operand_order:1; + uint32_t broadcast_constant:1; + uint32_t reserved2:8; +#ifdef __cplusplus +public: + npu_set_ifm2_broadcast_t(NPU_NAMESPACE::broadcast_mode _broadcast_h, NPU_NAMESPACE::broadcast_mode _broadcast_w, NPU_NAMESPACE::broadcast_mode _broadcast_c, NPU_NAMESPACE::ifm2_operand_order _operand_order, NPU_NAMESPACE::broadcast_mode _broadcast_constant) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_BROADCAST)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + broadcast_h(static_cast(_broadcast_h) & ((1U << 1)-1)), + broadcast_w(static_cast(_broadcast_w) & ((1U << 1)-1)), + broadcast_c(static_cast(_broadcast_c) & ((1U << 1)-1)), + reserved1(0), + operand_order(static_cast(_operand_order) & ((1U << 1)-1)), + broadcast_constant(static_cast(_broadcast_constant) & ((1U << 1)-1)), + reserved2(0) + {} + CONSTEXPR npu_set_ifm2_broadcast_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_BROADCAST)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + broadcast_h(0), + broadcast_w(0), + broadcast_c(0), + reserved1(0), + operand_order(0), + broadcast_constant(0), + reserved2(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_BROADCAST) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_BROADCAST); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= 
uint32_t(control) << 14; + word |= uint32_t(broadcast_h) << 16; + word |= uint32_t(broadcast_w) << 17; + word |= uint32_t(broadcast_c) << 18; + word |= uint32_t(operand_order) << 22; + word |= uint32_t(broadcast_constant) << 23; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm2_broadcast_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm2_broadcast_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::broadcast_mode get_broadcast_h() const + { + return static_cast(broadcast_h); + } + CONSTEXPR npu_set_ifm2_broadcast_t& set_broadcast_h(NPU_NAMESPACE::broadcast_mode value) + { + broadcast_h = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::broadcast_mode get_broadcast_w() const + { + return static_cast(broadcast_w); + } + CONSTEXPR npu_set_ifm2_broadcast_t& set_broadcast_w(NPU_NAMESPACE::broadcast_mode value) + { + broadcast_w = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::broadcast_mode get_broadcast_c() const + { + return static_cast(broadcast_c); + } + CONSTEXPR npu_set_ifm2_broadcast_t& set_broadcast_c(NPU_NAMESPACE::broadcast_mode value) + { + broadcast_c = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::ifm2_operand_order get_operand_order() const + { + return static_cast(operand_order); + } + CONSTEXPR npu_set_ifm2_broadcast_t& set_operand_order(NPU_NAMESPACE::ifm2_operand_order value) + { + operand_order = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::broadcast_mode get_broadcast_constant() const + { + return static_cast(broadcast_constant); + } + 
CONSTEXPR npu_set_ifm2_broadcast_t& set_broadcast_constant(NPU_NAMESPACE::broadcast_mode value) + { + broadcast_constant = static_cast(value) & ((1U << 1)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("broadcast_h", (broadcast_h < (sizeof(broadcast_mode_str)/sizeof(broadcast_mode_str[0])) ? broadcast_mode_str[broadcast_h] : "****"))); + fields.push_back(std::make_pair("broadcast_w", (broadcast_w < (sizeof(broadcast_mode_str)/sizeof(broadcast_mode_str[0])) ? broadcast_mode_str[broadcast_w] : "****"))); + fields.push_back(std::make_pair("broadcast_c", (broadcast_c < (sizeof(broadcast_mode_str)/sizeof(broadcast_mode_str[0])) ? broadcast_mode_str[broadcast_c] : "****"))); + fields.push_back(std::make_pair("operand_order", (operand_order < (sizeof(ifm2_operand_order_str)/sizeof(ifm2_operand_order_str[0])) ? ifm2_operand_order_str[operand_order] : "****"))); + fields.push_back(std::make_pair("broadcast_constant", (broadcast_constant < (sizeof(broadcast_mode_str)/sizeof(broadcast_mode_str[0])) ? 
broadcast_mode_str[broadcast_constant] : "****"))); + } +#endif +#endif +}; + +struct npu_set_ifm2_scalar_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t scalar:16; +#ifdef __cplusplus +public: + npu_set_ifm2_scalar_t(uint32_t _scalar) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_SCALAR)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + scalar(_scalar & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ifm2_scalar_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_SCALAR)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + scalar(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_SCALAR) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_SCALAR); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(scalar) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm2_scalar_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm2_scalar_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_scalar() const + { + return static_cast(scalar); + } + CONSTEXPR npu_set_ifm2_scalar_t& set_scalar(uint32_t value) + { + assert((value >> 16) == 0); + scalar = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void 
disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("scalar", std::to_string(scalar))); + } +#endif +#endif +}; + +struct npu_set_ifm2_precision_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t activation_type:1; + uint32_t reserved1:1; + uint32_t activation_precision:2; + uint32_t reserved2:2; + uint32_t activation_format:2; + uint32_t reserved3:8; +#ifdef __cplusplus +public: + npu_set_ifm2_precision_t(NPU_NAMESPACE::activation_type _activation_type, NPU_NAMESPACE::activation_precision _activation_precision, NPU_NAMESPACE::activation_format _activation_format) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_PRECISION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + activation_type(static_cast(_activation_type) & ((1U << 1)-1)), + reserved1(0), + activation_precision(static_cast(_activation_precision) & ((1U << 2)-1)), + reserved2(0), + activation_format(static_cast(_activation_format) & ((1U << 2)-1)), + reserved3(0) + {} + CONSTEXPR npu_set_ifm2_precision_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_PRECISION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + activation_type(0), + reserved1(0), + activation_precision(0), + reserved2(0), + activation_format(0), + reserved3(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_PRECISION) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_PRECISION); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(activation_type) << 16; + word |= uint32_t(activation_precision) << 18; + word |= uint32_t(activation_format) 
<< 22; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm2_precision_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm2_precision_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::activation_type get_activation_type() const + { + return static_cast(activation_type); + } + CONSTEXPR npu_set_ifm2_precision_t& set_activation_type(NPU_NAMESPACE::activation_type value) + { + activation_type = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::activation_precision get_activation_precision() const + { + return static_cast(activation_precision); + } + CONSTEXPR npu_set_ifm2_precision_t& set_activation_precision(NPU_NAMESPACE::activation_precision value) + { + activation_precision = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::activation_format get_activation_format() const + { + return static_cast(activation_format); + } + CONSTEXPR npu_set_ifm2_precision_t& set_activation_format(NPU_NAMESPACE::activation_format value) + { + activation_format = static_cast(value) & ((1U << 2)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("activation_type", (activation_type < (sizeof(activation_type_str)/sizeof(activation_type_str[0])) ? activation_type_str[activation_type] : "****"))); + fields.push_back(std::make_pair("activation_precision", (activation_precision < (sizeof(activation_precision_str)/sizeof(activation_precision_str[0])) ? 
activation_precision_str[activation_precision] : "****"))); + fields.push_back(std::make_pair("activation_format", (activation_format < (sizeof(activation_format_str)/sizeof(activation_format_str[0])) ? activation_format_str[activation_format] : "****"))); + } +#endif +#endif +}; + +struct npu_set_ifm2_zero_point_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t zero_point:16; +#ifdef __cplusplus +public: + npu_set_ifm2_zero_point_t(uint32_t _zero_point) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_ZERO_POINT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + zero_point(_zero_point & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ifm2_zero_point_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_ZERO_POINT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + zero_point(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_ZERO_POINT) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_ZERO_POINT); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(zero_point) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm2_zero_point_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm2_zero_point_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t 
get_zero_point() const + { + return static_cast(zero_point); + } + CONSTEXPR npu_set_ifm2_zero_point_t& set_zero_point(uint32_t value) + { + assert((value >> 16) == 0); + zero_point = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("zero_point", std::to_string(zero_point))); + } +#endif +#endif +}; + +struct npu_set_ifm2_width0_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t width_m1:16; +#ifdef __cplusplus +public: + npu_set_ifm2_width0_m1_t(uint32_t _width_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_WIDTH0_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + width_m1(_width_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ifm2_width0_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_WIDTH0_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + width_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_WIDTH0_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_WIDTH0_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(width_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm2_width0_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm2_width0_m1_t& 
set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_width_m1() const + { + return static_cast(width_m1); + } + CONSTEXPR npu_set_ifm2_width0_m1_t& set_width_m1(uint32_t value) + { + assert((value >> 16) == 0); + width_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("width_m1", std::to_string(width_m1))); + } +#endif +#endif +}; + +struct npu_set_ifm2_height0_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t height_m1:16; +#ifdef __cplusplus +public: + npu_set_ifm2_height0_m1_t(uint32_t _height_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_HEIGHT0_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(_height_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ifm2_height0_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_HEIGHT0_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_HEIGHT0_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_HEIGHT0_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(height_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm2_height0_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR 
NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm2_height0_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_height_m1() const + { + return static_cast(height_m1); + } + CONSTEXPR npu_set_ifm2_height0_m1_t& set_height_m1(uint32_t value) + { + assert((value >> 16) == 0); + height_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("height_m1", std::to_string(height_m1))); + } +#endif +#endif +}; + +struct npu_set_ifm2_height1_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t height_m1:16; +#ifdef __cplusplus +public: + npu_set_ifm2_height1_m1_t(uint32_t _height_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_HEIGHT1_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(_height_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ifm2_height1_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_HEIGHT1_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_HEIGHT1_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_HEIGHT1_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(height_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm2_height1_m1_t& 
set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm2_height1_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_height_m1() const + { + return static_cast(height_m1); + } + CONSTEXPR npu_set_ifm2_height1_m1_t& set_height_m1(uint32_t value) + { + assert((value >> 16) == 0); + height_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("height_m1", std::to_string(height_m1))); + } +#endif +#endif +}; + +struct npu_set_ifm2_ib_start_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t ib_start:6; + uint32_t reserved1:10; +#ifdef __cplusplus +public: + npu_set_ifm2_ib_start_t(uint32_t _ib_start) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_IB_START)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + ib_start(_ib_start & ((1U << 6)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_ifm2_ib_start_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_IB_START)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + ib_start(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_IB_START) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_IB_START); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(ib_start) << 
16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm2_ib_start_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm2_ib_start_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_ib_start() const + { + return static_cast(ib_start); + } + CONSTEXPR npu_set_ifm2_ib_start_t& set_ib_start(uint32_t value) + { + assert((value >> 6) == 0); + ib_start = static_cast(value & ((1U << 6)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("ib_start", std::to_string(ib_start))); + } +#endif +#endif +}; + +struct npu_set_ifm2_region_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t region:3; + uint32_t reserved1:13; +#ifdef __cplusplus +public: + npu_set_ifm2_region_t(uint32_t _region) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(_region & ((1U << 3)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_ifm2_region_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_REGION) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_REGION); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + 
uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(region) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm2_region_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm2_region_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_region() const + { + return static_cast(region); + } + CONSTEXPR npu_set_ifm2_region_t& set_region(uint32_t value) + { + assert((value >> 3) == 0); + region = static_cast(value & ((1U << 3)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("region", std::to_string(region))); + } +#endif +#endif +}; + +struct npu_set_ifm_base0_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ifm_base0_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE0)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm_base0_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE0)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE0) && control >= 1 
&& control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE0); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ifm_base0_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ifm_base1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ifm_base1_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm_base1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE1) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = 
static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ifm_base1_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ifm_base2_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ifm_base2_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE2)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm_base2_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE2)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE2) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE2); control = 
static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ifm_base2_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ifm_base3_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ifm_base3_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE3)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm_base3_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE3)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE3) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE3); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; 
+ word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ifm_base3_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ifm_stride_x_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ifm_stride_x_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_X)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm_stride_x_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_X)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_X) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_X); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= 
uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ifm_stride_x_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ifm_stride_y_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ifm_stride_y_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_Y)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm_stride_y_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_Y)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_Y) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_Y); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; 
+ } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ifm_stride_y_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ifm_stride_c_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ifm_stride_c_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_C)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm_stride_c_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_C)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_C) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_C); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return 
(static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ifm_stride_c_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ofm_base0_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ofm_base0_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE0)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ofm_base0_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE0)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE0) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE0); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ofm_base0_t& set_addr(uint64_t 
value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ofm_base1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ofm_base1_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ofm_base1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE1) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ofm_base1_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = 
static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ofm_base2_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ofm_base2_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE2)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ofm_base2_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE2)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE2) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE2); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ofm_base2_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef 
NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ofm_base3_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ofm_base3_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE3)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ofm_base3_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE3)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE3) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE3); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ofm_base3_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr 
<< std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ofm_stride_x_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ofm_stride_x_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_X)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ofm_stride_x_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_X)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_X) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_X); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ofm_stride_x_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + 
fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ofm_stride_y_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ofm_stride_y_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_Y)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ofm_stride_y_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_Y)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_Y) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_Y); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ofm_stride_y_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif 
+}; + +struct npu_set_ofm_stride_c_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ofm_stride_c_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_C)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ofm_stride_c_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_C)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_C) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_C); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ofm_stride_c_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_weight_base_t +{ +#ifdef __cplusplus +private: 
+#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_weight_base_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT_BASE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_weight_base_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT_BASE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT_BASE) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT_BASE); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_weight_base_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_weight_length_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + 
uint32_t reserved1:16; + uint32_t length:32; +#ifdef __cplusplus +public: + npu_set_weight_length_t(uint32_t _length) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT_LENGTH)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + reserved1(0), + length(_length) + {} + CONSTEXPR npu_set_weight_length_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT_LENGTH)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + reserved1(0), + length(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT_LENGTH) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT_LENGTH); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(length) << 32; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd1_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_weight_length_t& set_opcode(NPU_NAMESPACE::cmd1_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_weight_length_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_length() const + { + return static_cast(length); + } + CONSTEXPR npu_set_weight_length_t& set_length(uint32_t value) + { + length = value; + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("length", std::to_string(length))); + } +#endif +#endif +}; + +struct npu_set_scale_base_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t 
control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_scale_base_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE_BASE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_scale_base_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE_BASE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE_BASE) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE_BASE); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_scale_base_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_scale_length_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t reserved1:16; + uint32_t length:20; + uint32_t reserved2:12; 
+#ifdef __cplusplus +public: + npu_set_scale_length_t(uint32_t _length) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE_LENGTH)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + reserved1(0), + length(_length & ((1U << 20)-1)), + reserved2(0) + {} + CONSTEXPR npu_set_scale_length_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE_LENGTH)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + reserved1(0), + length(0), + reserved2(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE_LENGTH) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE_LENGTH); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(length) << 32; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd1_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_scale_length_t& set_opcode(NPU_NAMESPACE::cmd1_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_scale_length_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_length() const + { + return static_cast(length); + } + CONSTEXPR npu_set_scale_length_t& set_length(uint32_t value) + { + assert((value >> 20) == 0); + length = value & ((1U << 20)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("length", std::to_string(length))); + } +#endif +#endif +}; + +struct npu_set_ofm_scale_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t 
opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t shift:6; + uint32_t reserved1:10; + uint32_t scale:32; +#ifdef __cplusplus +public: + npu_set_ofm_scale_t(uint32_t _shift, uint32_t _scale) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_SCALE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + shift(_shift & ((1U << 6)-1)), + reserved1(0), + scale(_scale) + {} + CONSTEXPR npu_set_ofm_scale_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_SCALE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + shift(0), + reserved1(0), + scale(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_SCALE) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_SCALE); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(shift) << 16; + word |= uint64_t(scale) << 32; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd1_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ofm_scale_t& set_opcode(NPU_NAMESPACE::cmd1_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ofm_scale_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_shift() const + { + return static_cast(shift); + } + CONSTEXPR npu_set_ofm_scale_t& set_shift(uint32_t value) + { + assert((value >> 6) == 0); + shift = static_cast(value & ((1U << 6)-1)); + return *this; + } + CONSTEXPR uint32_t get_scale() const + { + return static_cast(scale); + } + CONSTEXPR npu_set_ofm_scale_t& 
set_scale(uint32_t value) + { + scale = value; + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("shift", std::to_string(shift))); + fields.push_back(std::make_pair("scale", std::to_string(scale))); + } +#endif +#endif +}; + +struct npu_set_opa_scale_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t shift:6; + uint32_t reserved1:10; + uint32_t scale:32; +#ifdef __cplusplus +public: + npu_set_opa_scale_t(uint32_t _shift, uint32_t _scale) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OPA_SCALE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + shift(_shift & ((1U << 6)-1)), + reserved1(0), + scale(_scale) + {} + CONSTEXPR npu_set_opa_scale_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OPA_SCALE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + shift(0), + reserved1(0), + scale(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OPA_SCALE) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OPA_SCALE); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(shift) << 16; + word |= uint64_t(scale) << 32; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd1_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_opa_scale_t& set_opcode(NPU_NAMESPACE::cmd1_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_opa_scale_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) 
& ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_shift() const + { + return static_cast(shift); + } + CONSTEXPR npu_set_opa_scale_t& set_shift(uint32_t value) + { + assert((value >> 6) == 0); + shift = static_cast(value & ((1U << 6)-1)); + return *this; + } + CONSTEXPR uint32_t get_scale() const + { + return static_cast(scale); + } + CONSTEXPR npu_set_opa_scale_t& set_scale(uint32_t value) + { + scale = value; + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("shift", std::to_string(shift))); + fields.push_back(std::make_pair("scale", std::to_string(scale))); + } +#endif +#endif +}; + +struct npu_set_opb_scale_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t reserved1:16; + uint32_t scale:16; + uint32_t reserved2:16; +#ifdef __cplusplus +public: + npu_set_opb_scale_t(uint32_t _scale) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OPB_SCALE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + reserved1(0), + scale(_scale & ((1U << 16)-1)), + reserved2(0) + {} + CONSTEXPR npu_set_opb_scale_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OPB_SCALE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + reserved1(0), + scale(0), + reserved2(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OPB_SCALE) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OPB_SCALE); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(scale) << 32; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd1_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR 
npu_set_opb_scale_t& set_opcode(NPU_NAMESPACE::cmd1_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_opb_scale_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_scale() const + { + return static_cast(scale); + } + CONSTEXPR npu_set_opb_scale_t& set_scale(uint32_t value) + { + assert((value >> 16) == 0); + scale = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("scale", std::to_string(scale))); + } +#endif +#endif +}; + +struct npu_set_dma0_src_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_dma0_src_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SRC)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_dma0_src_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SRC)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SRC) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SRC); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + 
word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_dma0_src_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_dma0_dst_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_dma0_dst_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_DST)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_dma0_dst_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_DST)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_DST) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_DST); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + 
return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_dma0_dst_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_dma0_len_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_dma0_len_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_LEN)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_dma0_len_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_LEN)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_LEN) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_LEN); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; 
+ } + CONSTEXPR npu_set_dma0_len_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_dma0_skip0_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_dma0_skip0_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SKIP0)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_dma0_skip0_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SKIP0)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SKIP0) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SKIP0); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_dma0_skip0_t& set_addr(uint64_t value) + { + addr_lo = 
static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_dma0_skip1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_dma0_skip1_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SKIP1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_dma0_skip1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SKIP1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SKIP1) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SKIP1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_dma0_skip1_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = 
static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ifm2_base0_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ifm2_base0_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE0)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm2_base0_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE0)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE0) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE0); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ifm2_base0_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } 
+#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ifm2_base1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ifm2_base1_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm2_base1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE1) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ifm2_base1_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + 
std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ifm2_base2_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ifm2_base2_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE2)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm2_base2_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE2)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE2) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE2); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ifm2_base2_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + 
fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ifm2_base3_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ifm2_base3_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE3)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm2_base3_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE3)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE3) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE3); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ifm2_base3_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct 
npu_set_ifm2_stride_x_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ifm2_stride_x_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_X)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm2_stride_x_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_X)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_X) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_X); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ifm2_stride_x_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ifm2_stride_y_t +{ +#ifdef __cplusplus +private: +#endif + 
uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ifm2_stride_y_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_Y)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm2_stride_y_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_Y)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_Y) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_Y); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ifm2_stride_y_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ifm2_stride_c_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + 
uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ifm2_stride_c_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_C)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm2_stride_c_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_C)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_C) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_C); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ifm2_stride_c_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_weight1_base_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; 
+#ifdef __cplusplus +public: + npu_set_weight1_base_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT1_BASE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_weight1_base_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT1_BASE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT1_BASE) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT1_BASE); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_weight1_base_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_weight1_length_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t reserved1:16; + uint32_t length:32; +#ifdef __cplusplus +public: + npu_set_weight1_length_t(uint32_t _length) : + 
opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT1_LENGTH)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + reserved1(0), + length(_length) + {} + CONSTEXPR npu_set_weight1_length_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT1_LENGTH)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + reserved1(0), + length(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT1_LENGTH) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT1_LENGTH); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(length) << 32; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd1_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_weight1_length_t& set_opcode(NPU_NAMESPACE::cmd1_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_weight1_length_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_length() const + { + return static_cast(length); + } + CONSTEXPR npu_set_weight1_length_t& set_length(uint32_t value) + { + length = value; + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("length", std::to_string(length))); + } +#endif +#endif +}; + +struct npu_set_scale1_base_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + 
npu_set_scale1_base_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE1_BASE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_scale1_base_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE1_BASE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE1_BASE) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE1_BASE); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_scale1_base_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_scale1_length_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t reserved1:16; + uint32_t length:20; + uint32_t reserved2:12; +#ifdef __cplusplus +public: + npu_set_scale1_length_t(uint32_t _length) : + 
opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE1_LENGTH)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + reserved1(0), + length(_length & ((1U << 20)-1)), + reserved2(0) + {} + CONSTEXPR npu_set_scale1_length_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE1_LENGTH)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + reserved1(0), + length(0), + reserved2(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE1_LENGTH) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE1_LENGTH); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(length) << 32; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd1_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_scale1_length_t& set_opcode(NPU_NAMESPACE::cmd1_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_scale1_length_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_length() const + { + return static_cast(length); + } + CONSTEXPR npu_set_scale1_length_t& set_length(uint32_t value) + { + assert((value >> 20) == 0); + length = value & ((1U << 20)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("length", std::to_string(length))); + } +#endif +#endif +}; +#ifdef __cplusplus +}; +#endif +#define NPU_OP_STRUCTS \ + NPU_OP_(stop) \ + NPU_OP_(irq) \ + NPU_OP_(conv) \ + NPU_OP_(depthwise) \ + NPU_OP_(pool) \ + 
NPU_OP_(elementwise) \ + NPU_OP_(dma_start) \ + NPU_OP_(dma_wait) \ + NPU_OP_(kernel_wait) \ + NPU_OP_(pmu_mask) + +#define NPU_SET_STRUCTS \ + NPU_SET_(ifm_pad_top) \ + NPU_SET_(ifm_pad_left) \ + NPU_SET_(ifm_pad_right) \ + NPU_SET_(ifm_pad_bottom) \ + NPU_SET_(ifm_depth_m1) \ + NPU_SET_(ifm_precision) \ + NPU_SET_(ifm_upscale) \ + NPU_SET_(ifm_zero_point) \ + NPU_SET_(ifm_width0_m1) \ + NPU_SET_(ifm_height0_m1) \ + NPU_SET_(ifm_height1_m1) \ + NPU_SET_(ifm_ib_end) \ + NPU_SET_(ifm_region) \ + NPU_SET_(ofm_width_m1) \ + NPU_SET_(ofm_height_m1) \ + NPU_SET_(ofm_depth_m1) \ + NPU_SET_(ofm_precision) \ + NPU_SET_(ofm_blk_width_m1) \ + NPU_SET_(ofm_blk_height_m1) \ + NPU_SET_(ofm_blk_depth_m1) \ + NPU_SET_(ofm_zero_point) \ + NPU_SET_(ofm_width0_m1) \ + NPU_SET_(ofm_height0_m1) \ + NPU_SET_(ofm_height1_m1) \ + NPU_SET_(ofm_region) \ + NPU_SET_(kernel_width_m1) \ + NPU_SET_(kernel_height_m1) \ + NPU_SET_(kernel_stride) \ + NPU_SET_(parallel_mode) \ + NPU_SET_(acc_format) \ + NPU_SET_(activation) \ + NPU_SET_(activation_min) \ + NPU_SET_(activation_max) \ + NPU_SET_(weight_region) \ + NPU_SET_(scale_region) \ + NPU_SET_(ab_start) \ + NPU_SET_(blockdep) \ + NPU_SET_(dma0_src_region) \ + NPU_SET_(dma0_dst_region) \ + NPU_SET_(dma0_size0) \ + NPU_SET_(dma0_size1) \ + NPU_SET_(ifm2_broadcast) \ + NPU_SET_(ifm2_scalar) \ + NPU_SET_(ifm2_precision) \ + NPU_SET_(ifm2_zero_point) \ + NPU_SET_(ifm2_width0_m1) \ + NPU_SET_(ifm2_height0_m1) \ + NPU_SET_(ifm2_height1_m1) \ + NPU_SET_(ifm2_ib_start) \ + NPU_SET_(ifm2_region) \ + NPU_SET_(ifm_base0) \ + NPU_SET_(ifm_base1) \ + NPU_SET_(ifm_base2) \ + NPU_SET_(ifm_base3) \ + NPU_SET_(ifm_stride_x) \ + NPU_SET_(ifm_stride_y) \ + NPU_SET_(ifm_stride_c) \ + NPU_SET_(ofm_base0) \ + NPU_SET_(ofm_base1) \ + NPU_SET_(ofm_base2) \ + NPU_SET_(ofm_base3) \ + NPU_SET_(ofm_stride_x) \ + NPU_SET_(ofm_stride_y) \ + NPU_SET_(ofm_stride_c) \ + NPU_SET_(weight_base) \ + NPU_SET_(weight_length) \ + NPU_SET_(scale_base) \ + NPU_SET_(scale_length) \ + 
NPU_SET_(ofm_scale) \ + NPU_SET_(opa_scale) \ + NPU_SET_(opb_scale) \ + NPU_SET_(dma0_src) \ + NPU_SET_(dma0_dst) \ + NPU_SET_(dma0_len) \ + NPU_SET_(dma0_skip0) \ + NPU_SET_(dma0_skip1) \ + NPU_SET_(ifm2_base0) \ + NPU_SET_(ifm2_base1) \ + NPU_SET_(ifm2_base2) \ + NPU_SET_(ifm2_base3) \ + NPU_SET_(ifm2_stride_x) \ + NPU_SET_(ifm2_stride_y) \ + NPU_SET_(ifm2_stride_c) \ + NPU_SET_(weight1_base) \ + NPU_SET_(weight1_length) \ + NPU_SET_(scale1_base) \ + NPU_SET_(scale1_length) + +#define EXPAND_ACC_FORMAT(FUNC, SEP) \ + FUNC(acc_format, I32) SEP \ + FUNC(acc_format, I40) SEP \ + FUNC(acc_format, F16) + +#define EXPAND_ACTIVATION_CLIP_RANGE(FUNC, SEP) \ + FUNC(activation_clip_range, OFM_PRECISION) SEP \ + FUNC(activation_clip_range, FORCE_UINT8) SEP \ + FUNC(activation_clip_range, FORCE_INT8) SEP \ + FUNC(activation_clip_range, FORCE_INT16) + +#define EXPAND_ACTIVATION_FORMAT(FUNC, SEP) \ + FUNC(activation_format, NHWC) SEP \ + FUNC(activation_format, NHCWB16) + +#define EXPAND_ACTIVATION_FUNCTION(FUNC, SEP) \ + FUNC(activation_function, RELU) SEP \ + FUNC(activation_function, TANH) SEP \ + FUNC(activation_function, SIGMOID) SEP \ + FUNC(activation_function, TABLE_0) SEP \ + FUNC(activation_function, TABLE_1) SEP \ + FUNC(activation_function, TABLE_2) SEP \ + FUNC(activation_function, TABLE_3) SEP \ + FUNC(activation_function, TABLE_4) SEP \ + FUNC(activation_function, TABLE_5) SEP \ + FUNC(activation_function, TABLE_6) SEP \ + FUNC(activation_function, TABLE_7) + +#define EXPAND_ACTIVATION_PRECISION(FUNC, SEP) \ + FUNC(activation_precision, B8) SEP \ + FUNC(activation_precision, B16) SEP \ + FUNC(activation_precision, B32) SEP \ + FUNC(activation_precision, B64) + +#define EXPAND_ACTIVATION_TYPE(FUNC, SEP) \ + FUNC(activation_type, UNSIGNED) SEP \ + FUNC(activation_type, SIGNED) + +#define EXPAND_AXI_MEM_ENCODING(FUNC, SEP) \ + FUNC(axi_mem_encoding, DEVICE_NON_BUFFERABLE) SEP \ + FUNC(axi_mem_encoding, DEVICE_BUFFERABLE) SEP \ + FUNC(axi_mem_encoding, 
NORMAL_NON_CACHEABLE_NON_BUFFERABLE) SEP \ + FUNC(axi_mem_encoding, NORMAL_NON_CACHEABLE_BUFFERABLE) SEP \ + FUNC(axi_mem_encoding, WRITE_THROUGH_NO_ALLOCATE) SEP \ + FUNC(axi_mem_encoding, WRITE_THROUGH_READ_ALLOCATE) SEP \ + FUNC(axi_mem_encoding, WRITE_THROUGH_WRITE_ALLOCATE) SEP \ + FUNC(axi_mem_encoding, WRITE_THROUGH_READ_AND_WRITE_ALLOCATE) SEP \ + FUNC(axi_mem_encoding, WRITE_BACK_NO_ALLOCATE) SEP \ + FUNC(axi_mem_encoding, WRITE_BACK_READ_ALLOCATE) SEP \ + FUNC(axi_mem_encoding, WRITE_BACK_WRITE_ALLOCATE) SEP \ + FUNC(axi_mem_encoding, WRITE_BACK_READ_AND_WRITE_ALLOCATE) + +#define EXPAND_BROADCAST_MODE(FUNC, SEP) \ + FUNC(broadcast_mode, DISABLE) SEP \ + FUNC(broadcast_mode, ENABLE) + +#define EXPAND_CMD0_OPCODE(FUNC, SEP) \ + FUNC(cmd0_opcode, NPU_OP_STOP) SEP \ + FUNC(cmd0_opcode, NPU_OP_IRQ) SEP \ + FUNC(cmd0_opcode, NPU_OP_CONV) SEP \ + FUNC(cmd0_opcode, NPU_OP_DEPTHWISE) SEP \ + FUNC(cmd0_opcode, NPU_OP_POOL) SEP \ + FUNC(cmd0_opcode, NPU_OP_ELEMENTWISE) SEP \ + FUNC(cmd0_opcode, NPU_OP_DMA_START) SEP \ + FUNC(cmd0_opcode, NPU_OP_DMA_WAIT) SEP \ + FUNC(cmd0_opcode, NPU_OP_KERNEL_WAIT) SEP \ + FUNC(cmd0_opcode, NPU_OP_PMU_MASK) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM_PAD_TOP) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM_PAD_LEFT) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM_PAD_RIGHT) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM_PAD_BOTTOM) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM_DEPTH_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM_PRECISION) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM_UPSCALE) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM_ZERO_POINT) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM_WIDTH0_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM_HEIGHT0_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM_HEIGHT1_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM_IB_END) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM_REGION) SEP \ + FUNC(cmd0_opcode, NPU_SET_OFM_WIDTH_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_OFM_HEIGHT_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_OFM_DEPTH_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_OFM_PRECISION) SEP \ + 
FUNC(cmd0_opcode, NPU_SET_OFM_BLK_WIDTH_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_OFM_BLK_HEIGHT_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_OFM_BLK_DEPTH_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_OFM_ZERO_POINT) SEP \ + FUNC(cmd0_opcode, NPU_SET_OFM_WIDTH0_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_OFM_HEIGHT0_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_OFM_HEIGHT1_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_OFM_REGION) SEP \ + FUNC(cmd0_opcode, NPU_SET_KERNEL_WIDTH_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_KERNEL_HEIGHT_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_KERNEL_STRIDE) SEP \ + FUNC(cmd0_opcode, NPU_SET_PARALLEL_MODE) SEP \ + FUNC(cmd0_opcode, NPU_SET_ACC_FORMAT) SEP \ + FUNC(cmd0_opcode, NPU_SET_ACTIVATION) SEP \ + FUNC(cmd0_opcode, NPU_SET_ACTIVATION_MIN) SEP \ + FUNC(cmd0_opcode, NPU_SET_ACTIVATION_MAX) SEP \ + FUNC(cmd0_opcode, NPU_SET_WEIGHT_REGION) SEP \ + FUNC(cmd0_opcode, NPU_SET_SCALE_REGION) SEP \ + FUNC(cmd0_opcode, NPU_SET_AB_START) SEP \ + FUNC(cmd0_opcode, NPU_SET_BLOCKDEP) SEP \ + FUNC(cmd0_opcode, NPU_SET_DMA0_SRC_REGION) SEP \ + FUNC(cmd0_opcode, NPU_SET_DMA0_DST_REGION) SEP \ + FUNC(cmd0_opcode, NPU_SET_DMA0_SIZE0) SEP \ + FUNC(cmd0_opcode, NPU_SET_DMA0_SIZE1) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM2_BROADCAST) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM2_SCALAR) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM2_PRECISION) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM2_ZERO_POINT) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM2_WIDTH0_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM2_HEIGHT0_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM2_HEIGHT1_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM2_IB_START) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM2_REGION) + +#define EXPAND_CMD1_OPCODE(FUNC, SEP) \ + FUNC(cmd1_opcode, NPU_SET_IFM_BASE0) SEP \ + FUNC(cmd1_opcode, NPU_SET_IFM_BASE1) SEP \ + FUNC(cmd1_opcode, NPU_SET_IFM_BASE2) SEP \ + FUNC(cmd1_opcode, NPU_SET_IFM_BASE3) SEP \ + FUNC(cmd1_opcode, NPU_SET_IFM_STRIDE_X) SEP \ + FUNC(cmd1_opcode, NPU_SET_IFM_STRIDE_Y) SEP \ + FUNC(cmd1_opcode, NPU_SET_IFM_STRIDE_C) SEP \ + FUNC(cmd1_opcode, 
NPU_SET_OFM_BASE0) SEP \ + FUNC(cmd1_opcode, NPU_SET_OFM_BASE1) SEP \ + FUNC(cmd1_opcode, NPU_SET_OFM_BASE2) SEP \ + FUNC(cmd1_opcode, NPU_SET_OFM_BASE3) SEP \ + FUNC(cmd1_opcode, NPU_SET_OFM_STRIDE_X) SEP \ + FUNC(cmd1_opcode, NPU_SET_OFM_STRIDE_Y) SEP \ + FUNC(cmd1_opcode, NPU_SET_OFM_STRIDE_C) SEP \ + FUNC(cmd1_opcode, NPU_SET_WEIGHT_BASE) SEP \ + FUNC(cmd1_opcode, NPU_SET_WEIGHT_LENGTH) SEP \ + FUNC(cmd1_opcode, NPU_SET_SCALE_BASE) SEP \ + FUNC(cmd1_opcode, NPU_SET_SCALE_LENGTH) SEP \ + FUNC(cmd1_opcode, NPU_SET_OFM_SCALE) SEP \ + FUNC(cmd1_opcode, NPU_SET_OPA_SCALE) SEP \ + FUNC(cmd1_opcode, NPU_SET_OPB_SCALE) SEP \ + FUNC(cmd1_opcode, NPU_SET_DMA0_SRC) SEP \ + FUNC(cmd1_opcode, NPU_SET_DMA0_DST) SEP \ + FUNC(cmd1_opcode, NPU_SET_DMA0_LEN) SEP \ + FUNC(cmd1_opcode, NPU_SET_DMA0_SKIP0) SEP \ + FUNC(cmd1_opcode, NPU_SET_DMA0_SKIP1) SEP \ + FUNC(cmd1_opcode, NPU_SET_IFM2_BASE0) SEP \ + FUNC(cmd1_opcode, NPU_SET_IFM2_BASE1) SEP \ + FUNC(cmd1_opcode, NPU_SET_IFM2_BASE2) SEP \ + FUNC(cmd1_opcode, NPU_SET_IFM2_BASE3) SEP \ + FUNC(cmd1_opcode, NPU_SET_IFM2_STRIDE_X) SEP \ + FUNC(cmd1_opcode, NPU_SET_IFM2_STRIDE_Y) SEP \ + FUNC(cmd1_opcode, NPU_SET_IFM2_STRIDE_C) SEP \ + FUNC(cmd1_opcode, NPU_SET_WEIGHT1_BASE) SEP \ + FUNC(cmd1_opcode, NPU_SET_WEIGHT1_LENGTH) SEP \ + FUNC(cmd1_opcode, NPU_SET_SCALE1_BASE) SEP \ + FUNC(cmd1_opcode, NPU_SET_SCALE1_LENGTH) + +#define EXPAND_CMD_CTRL(FUNC, SEP) \ + FUNC(cmd_ctrl, CMD0_CTRL) SEP \ + FUNC(cmd_ctrl, CMD1_CTRL) + +#define EXPAND_CUSTOM_DMA(FUNC, SEP) \ + FUNC(custom_dma, NOT_IMPLEMENTED) SEP \ + FUNC(custom_dma, IMPLEMENTED) + +#define EXPAND_DMA_FAULT_CHANNEL(FUNC, SEP) \ + FUNC(dma_fault_channel, CMD_READ) SEP \ + FUNC(dma_fault_channel, IFM_READ) SEP \ + FUNC(dma_fault_channel, WEIGHT_READ) SEP \ + FUNC(dma_fault_channel, SBS_READ) SEP \ + FUNC(dma_fault_channel, MEM2MEM_READ) SEP \ + FUNC(dma_fault_channel, OFM_WRITE) SEP \ + FUNC(dma_fault_channel, MEM2MEM_WRITE) + +#define EXPAND_DMA_FAULT_SRC(FUNC, SEP) \ + 
FUNC(dma_fault_src, AXI_M0) SEP \ + FUNC(dma_fault_src, AXI_M1) + +#define EXPAND_DMA_REGION_MODE(FUNC, SEP) \ + FUNC(dma_region_mode, EXTERNAL) SEP \ + FUNC(dma_region_mode, INTERNAL) + +#define EXPAND_DMA_STRIDE_MODE(FUNC, SEP) \ + FUNC(dma_stride_mode, D1) SEP \ + FUNC(dma_stride_mode, D2) SEP \ + FUNC(dma_stride_mode, D3) + +#define EXPAND_ELEMENTWISE_MODE(FUNC, SEP) \ + FUNC(elementwise_mode, MUL) SEP \ + FUNC(elementwise_mode, ADD) SEP \ + FUNC(elementwise_mode, SUB) SEP \ + FUNC(elementwise_mode, MIN) SEP \ + FUNC(elementwise_mode, MAX) SEP \ + FUNC(elementwise_mode, LRELU) SEP \ + FUNC(elementwise_mode, ABS) SEP \ + FUNC(elementwise_mode, CLZ) SEP \ + FUNC(elementwise_mode, SHR) SEP \ + FUNC(elementwise_mode, SHL) + +#define EXPAND_IFM2_OPERAND_ORDER(FUNC, SEP) \ + FUNC(ifm2_operand_order, ORDER_B) SEP \ + FUNC(ifm2_operand_order, ORDER_A) + +#define EXPAND_IFM_SCALE_MODE(FUNC, SEP) \ + FUNC(ifm_scale_mode, OPA_OPB_16) SEP \ + FUNC(ifm_scale_mode, OPA_32) SEP \ + FUNC(ifm_scale_mode, OPB_32) + +#define EXPAND_IFM_UPSCALE_MODE(FUNC, SEP) \ + FUNC(ifm_upscale_mode, NONE) SEP \ + FUNC(ifm_upscale_mode, NEAREST) SEP \ + FUNC(ifm_upscale_mode, ZEROS) + +#define EXPAND_KERNEL_DECOMPOSITION(FUNC, SEP) \ + FUNC(kernel_decomposition, D8X8) SEP \ + FUNC(kernel_decomposition, D4X4) + +#define EXPAND_KERNEL_DILATION(FUNC, SEP) \ + FUNC(kernel_dilation, NONE) SEP \ + FUNC(kernel_dilation, X2) + +#define EXPAND_MAX_BEATS(FUNC, SEP) \ + FUNC(max_beats, B64) SEP \ + FUNC(max_beats, B128) SEP \ + FUNC(max_beats, B256) + +#define EXPAND_MEM_ATTR(FUNC, SEP) \ + FUNC(mem_attr, AXI0_OUTSTANDING_COUNTER0) SEP \ + FUNC(mem_attr, AXI0_OUTSTANDING_COUNTER1) SEP \ + FUNC(mem_attr, AXI1_OUTSTANDING_COUNTER2) SEP \ + FUNC(mem_attr, AXI1_OUTSTANDING_COUNTER3) + +#define EXPAND_OFM_SCALE_MODE(FUNC, SEP) \ + FUNC(ofm_scale_mode, PER_CHANNEL) SEP \ + FUNC(ofm_scale_mode, GLOBAL) + +#define EXPAND_PARALLEL_MODE(FUNC, SEP) \ + FUNC(parallel_mode, SINGLE_CORE) SEP \ + FUNC(parallel_mode, 
DUAL_CORE_DEPTH) + +#define EXPAND_PMU_AXI_CHANNEL(FUNC, SEP) \ + FUNC(pmu_axi_channel, RD_CMD) SEP \ + FUNC(pmu_axi_channel, RD_IFM) SEP \ + FUNC(pmu_axi_channel, RD_WEIGHTS) SEP \ + FUNC(pmu_axi_channel, RD_SCALE_BIAS) SEP \ + FUNC(pmu_axi_channel, RD_MEM2MEM) SEP \ + FUNC(pmu_axi_channel, WR_OFM) SEP \ + FUNC(pmu_axi_channel, WR_MEM2MEM) + +#define EXPAND_PMU_EVENT(FUNC, SEP) \ + FUNC(pmu_event, NO_EVENT) SEP \ + FUNC(pmu_event, CYCLE) SEP \ + FUNC(pmu_event, NPU_IDLE) SEP \ + FUNC(pmu_event, CC_STALLED_ON_BLOCKDEP) SEP \ + FUNC(pmu_event, CC_STALLED_ON_SHRAM_RECONFIG) SEP \ + FUNC(pmu_event, NPU_ACTIVE) SEP \ + FUNC(pmu_event, MAC_ACTIVE) SEP \ + FUNC(pmu_event, MAC_ACTIVE_8BIT) SEP \ + FUNC(pmu_event, MAC_ACTIVE_16BIT) SEP \ + FUNC(pmu_event, MAC_DPU_ACTIVE) SEP \ + FUNC(pmu_event, MAC_STALLED_BY_WD_ACC) SEP \ + FUNC(pmu_event, MAC_STALLED_BY_WD) SEP \ + FUNC(pmu_event, MAC_STALLED_BY_ACC) SEP \ + FUNC(pmu_event, MAC_STALLED_BY_IB) SEP \ + FUNC(pmu_event, MAC_ACTIVE_32BIT) SEP \ + FUNC(pmu_event, MAC_STALLED_BY_INT_W) SEP \ + FUNC(pmu_event, MAC_STALLED_BY_INT_ACC) SEP \ + FUNC(pmu_event, AO_ACTIVE) SEP \ + FUNC(pmu_event, AO_ACTIVE_8BIT) SEP \ + FUNC(pmu_event, AO_ACTIVE_16BIT) SEP \ + FUNC(pmu_event, AO_STALLED_BY_OFMP_OB) SEP \ + FUNC(pmu_event, AO_STALLED_BY_OFMP) SEP \ + FUNC(pmu_event, AO_STALLED_BY_OB) SEP \ + FUNC(pmu_event, AO_STALLED_BY_ACC_IB) SEP \ + FUNC(pmu_event, AO_STALLED_BY_ACC) SEP \ + FUNC(pmu_event, AO_STALLED_BY_IB) SEP \ + FUNC(pmu_event, WD_ACTIVE) SEP \ + FUNC(pmu_event, WD_STALLED) SEP \ + FUNC(pmu_event, WD_STALLED_BY_WS) SEP \ + FUNC(pmu_event, WD_STALLED_BY_WD_BUF) SEP \ + FUNC(pmu_event, WD_PARSE_ACTIVE) SEP \ + FUNC(pmu_event, WD_PARSE_STALLED) SEP \ + FUNC(pmu_event, WD_PARSE_STALLED_IN) SEP \ + FUNC(pmu_event, WD_PARSE_STALLED_OUT) SEP \ + FUNC(pmu_event, WD_TRANS_WS) SEP \ + FUNC(pmu_event, WD_TRANS_WB) SEP \ + FUNC(pmu_event, WD_TRANS_DW0) SEP \ + FUNC(pmu_event, WD_TRANS_DW1) SEP \ + FUNC(pmu_event, AXI0_RD_TRANS_ACCEPTED) 
SEP \ + FUNC(pmu_event, AXI0_RD_TRANS_COMPLETED) SEP \ + FUNC(pmu_event, AXI0_RD_DATA_BEAT_RECEIVED) SEP \ + FUNC(pmu_event, AXI0_RD_TRAN_REQ_STALLED) SEP \ + FUNC(pmu_event, AXI0_WR_TRANS_ACCEPTED) SEP \ + FUNC(pmu_event, AXI0_WR_TRANS_COMPLETED_M) SEP \ + FUNC(pmu_event, AXI0_WR_TRANS_COMPLETED_S) SEP \ + FUNC(pmu_event, AXI0_WR_DATA_BEAT_WRITTEN) SEP \ + FUNC(pmu_event, AXI0_WR_TRAN_REQ_STALLED) SEP \ + FUNC(pmu_event, AXI0_WR_DATA_BEAT_STALLED) SEP \ + FUNC(pmu_event, AXI0_ENABLED_CYCLES) SEP \ + FUNC(pmu_event, AXI0_RD_STALL_LIMIT) SEP \ + FUNC(pmu_event, AXI0_WR_STALL_LIMIT) SEP \ + FUNC(pmu_event, AXI_LATENCY_ANY) SEP \ + FUNC(pmu_event, AXI_LATENCY_32) SEP \ + FUNC(pmu_event, AXI_LATENCY_64) SEP \ + FUNC(pmu_event, AXI_LATENCY_128) SEP \ + FUNC(pmu_event, AXI_LATENCY_256) SEP \ + FUNC(pmu_event, AXI_LATENCY_512) SEP \ + FUNC(pmu_event, AXI_LATENCY_1024) SEP \ + FUNC(pmu_event, ECC_DMA) SEP \ + FUNC(pmu_event, ECC_SB0) SEP \ + FUNC(pmu_event, AXI1_RD_TRANS_ACCEPTED) SEP \ + FUNC(pmu_event, AXI1_RD_TRANS_COMPLETED) SEP \ + FUNC(pmu_event, AXI1_RD_DATA_BEAT_RECEIVED) SEP \ + FUNC(pmu_event, AXI1_RD_TRAN_REQ_STALLED) SEP \ + FUNC(pmu_event, AXI1_WR_TRANS_ACCEPTED) SEP \ + FUNC(pmu_event, AXI1_WR_TRANS_COMPLETED_M) SEP \ + FUNC(pmu_event, AXI1_WR_TRANS_COMPLETED_S) SEP \ + FUNC(pmu_event, AXI1_WR_DATA_BEAT_WRITTEN) SEP \ + FUNC(pmu_event, AXI1_WR_TRAN_REQ_STALLED) SEP \ + FUNC(pmu_event, AXI1_WR_DATA_BEAT_STALLED) SEP \ + FUNC(pmu_event, AXI1_ENABLED_CYCLES) SEP \ + FUNC(pmu_event, AXI1_RD_STALL_LIMIT) SEP \ + FUNC(pmu_event, AXI1_WR_STALL_LIMIT) SEP \ + FUNC(pmu_event, ECC_SB1) + +#define EXPAND_POOLING_MODE(FUNC, SEP) \ + FUNC(pooling_mode, MAX) SEP \ + FUNC(pooling_mode, AVERAGE) SEP \ + FUNC(pooling_mode, REDUCE_SUM) + +#define EXPAND_PRIVILEGE_LEVEL(FUNC, SEP) \ + FUNC(privilege_level, USER) SEP \ + FUNC(privilege_level, PRIVILEGED) + +#define EXPAND_ROUND_MODE(FUNC, SEP) \ + FUNC(round_mode, DBL) SEP \ + FUNC(round_mode, TRUNCATE) SEP \ + FUNC(round_mode, 
NATURAL) + +#define EXPAND_SECURITY_LEVEL(FUNC, SEP) \ + FUNC(security_level, SECURE) SEP \ + FUNC(security_level, NON_SECURE) + +#define EXPAND_STATE(FUNC, SEP) \ + FUNC(state, STOPPED) SEP \ + FUNC(state, RUNNING) + +#define EXPAND_WD_CORE_SLICE_STATE(FUNC, SEP) \ + FUNC(wd_core_slice_state, HEADER) SEP \ + FUNC(wd_core_slice_state, PALETTE) SEP \ + FUNC(wd_core_slice_state, WEIGHTS) + +#define EXPAND_WD_CTRL_STATE(FUNC, SEP) \ + FUNC(wd_ctrl_state, IDLE) SEP \ + FUNC(wd_ctrl_state, DRAIN) SEP \ + FUNC(wd_ctrl_state, OFD_INIT) SEP \ + FUNC(wd_ctrl_state, OFD_RUN) + +#define EXPAND_WEIGHT_ORDER(FUNC, SEP) \ + FUNC(weight_order, DEPTH_FIRST) SEP \ + FUNC(weight_order, PART_KERNEL_FIRST) + +#ifdef __cplusplus +} +#endif diff --git a/ethosu/regor/architecture/ethosu65/ethos_u65_register_cs_generator.cpp b/ethosu/regor/architecture/ethosu65/ethos_u65_register_cs_generator.cpp new file mode 100644 index 00000000..75bc9b25 --- /dev/null +++ b/ethosu/regor/architecture/ethosu65/ethos_u65_register_cs_generator.cpp @@ -0,0 +1,40 @@ +// +// SPDX-FileCopyrightText: Copyright 2021, 2023 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#include "ethos_u65_register_cs_generator.hpp" + +#include "ethos_u65.hpp" +#define NPU_NAMESPACE ethosu65 +#include "ethos_u65_interface.hpp" + +namespace regor +{ + +using namespace ethosu65; + +EthosU65RCSGenerator::EthosU65RCSGenerator(ArchEthosU65 *arch) : EthosU55RCSGenerator(arch), _arch(arch) +{ +} + +void EthosU65RCSGenerator::GenerateInitialRegisterSetup() +{ + auto mode = _arch->_cores <= 1 ? parallel_mode::SINGLE_CORE : parallel_mode::DUAL_CORE_DEPTH; + Emit(isa::npu_set_parallel_mode_t(mode)); +} + +} // namespace regor diff --git a/ethosu/regor/architecture/ethosu65/ethos_u65_register_cs_generator.hpp b/ethosu/regor/architecture/ethosu65/ethos_u65_register_cs_generator.hpp new file mode 100644 index 00000000..f3448e17 --- /dev/null +++ b/ethosu/regor/architecture/ethosu65/ethos_u65_register_cs_generator.hpp @@ -0,0 +1,41 @@ +// +// SPDX-FileCopyrightText: Copyright 2021, 2023 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "architecture/ethosu55/ethos_u55_register_cs_generator.hpp" +#include "ethos_u65.hpp" + +namespace regor +{ +/// +/// Generates register command streams for Ethos U55 and Ethos U65. 
+/// +class EthosU65RCSGenerator : public EthosU55RCSGenerator +{ +public: + EthosU65RCSGenerator(ArchEthosU65 *arch); + +protected: + void GenerateInitialRegisterSetup() override; + +private: + ArchEthosU65 *_arch; +}; + +} // namespace regor diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85.cpp b/ethosu/regor/architecture/ethosu85/ethos_u85.cpp new file mode 100644 index 00000000..e9b04ad6 --- /dev/null +++ b/ethosu/regor/architecture/ethosu85/ethos_u85.cpp @@ -0,0 +1,1324 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#include "ethos_u85.hpp" + +#include "common/common.hpp" +#include "common/logging.hpp" + +#include "common/bit_flags.hpp" +#include "common/numeric_util.hpp" +#include "ethos_u85_performance.hpp" +#include "ethos_u85_register_cs_generator.hpp" +#include "ethos_u85_weight_encoder.hpp" + +#include +#include +#include +#include + +BEGIN_ENUM_TABLE(regor::EthosU85Accumulator) + ADD_ENUM_NAME(Acc32) + ADD_ENUM_NAME(Acc48) +END_ENUM_TABLE() + +BEGIN_ENUM_TABLE(regor::EthosU85Traversal) + ADD_ENUM_NAME(DepthFirst) + ADD_ENUM_NAME(PartKernel) + ADD_ENUM_NAME(Depthwise) +END_ENUM_TABLE() + +namespace regor +{ + +unsigned MaskForNpuOp(const EthosU85NpuOp npuOp, bool hasIfm2); +bool IsMinMaxReduction(OpType opType, const Kernel *kernel); + +static const EthosU85PerfInfo s_EthosU85PerfInfo[] = { + // Accelerator.Ethos_U85_128 + {{2.0, 3.0, 3.0, 3.0, 4.0, 6.0, 1.0, 2.0}, {1.0, 1.0, 0.0}}, + // Accelerator.Ethos_U85_256 + {{2.0, 3.0, 3.0, 3.0, 4.0, 6.0, 1.0, 2.0}, {1.0, 1.0, 0.0}}, + // Accelerator.Ethos_U85_512 + {{1.0, 1.5, 1.5, 1.5, 2.0, 3.0, 0.5, 1.0}, {1.0, 1.0, 0.0}}, + // Accelerator.Ethos_U85_1024 + {{0.75, 1.25, 0.75, 0.75, 1.0, 1.5, 0.25, 0.5}, {1.0, 0.5, 0.0}}, + // Accelerator.Ethos_U85_2048 + {{0.625, 1.125, 0.5, 0.375, 0.5, 0.75, 0.125, 0.25}, {1.0, 0.25, 0.0}}, +}; + +static const ArchEthosU85::AcceleratorConfig s_EthosU85Configs[] = { + // Accelerator.Ethos_U85_128 + {128, 1, {Shape(1, 2, 8), Shape(1, 1, 16)}, 2, Shape(1, 2, 8), 8 * 1024, 8 * 1024, 2, 1, 0, &s_EthosU85PerfInfo[0]}, + // Accelerator.Ethos_U85_256 + {256, 1, {Shape(1, 2, 16), Shape(1, 4, 8), Shape(2, 2, 8)}, 3, Shape(2, 2, 8), 16 * 1024, 16 * 1024, 4, 1, 0, &s_EthosU85PerfInfo[0]}, + // Accelerator.Ethos_U85_512 + {512, 2, {Shape(2, 2, 16), Shape(1, 4, 16)}, 2, Shape(2, 2, 16), 16 * 1024, 32 * 1024, 8, 1, 0, &s_EthosU85PerfInfo[1]}, + // Accelerator.Ethos_U85_1014 + {1024, 4, {Shape(2, 2, 32), Shape(1, 4, 32), Shape(2, 4, 16)}, 3, Shape(4, 2, 16), 16 * 1024, 64 * 1024, 16, 1, 1, 
&s_EthosU85PerfInfo[2]}, + // Accelerator.Ethos_U85_2048 + {2048, 4, {Shape(2, 2, 64), Shape(1, 4, 64), Shape(4, 4, 16)}, 3, Shape(4, 4, 16), 32 * 1024, 128 * 1024, 32, 2, 1, &s_EthosU85PerfInfo[3]}, +}; + +enum class ElementwiseUsage +{ + No = 0, + Full = 1, + Scalar = 2, +}; + +static int AccumulatorBits(EthosU85Accumulator accType) +{ + int bits = 32; + switch ( accType ) + { + case EthosU85Accumulator::Acc32: + bits = 32; + break; + case EthosU85Accumulator::Acc48: + bits = 64; + break; + default: + LOG_WARN("Invalid accumulator type for Ethos U85: {}\n", accType); + assert(false); + break; + } + return bits; +} + + +ArchEthosU85::ArchEthosU85() : _subkernelMax(8, 8, 65536), _ofmBlockMax(128, 128, 1024) +{ + _weightEncoder = std::make_unique(this); + _rcsGenerator = std::make_unique(this); +} + +uint32_t ArchEthosU85::Version() +{ + return EthosU85RCSGenerator::IdRegister(); +} + +bool ArchEthosU85::ParseConfig(IniReader *reader) +{ + // Parse architecture configuration + std::string key; + int macs = 0; + while ( reader->Begin(key) ) + { + if ( key == "macs" ) + { + macs = reader->Get(); + } + reader->End(); + } + + // Find the requested MAC configuration for this accelerator + auto cfg = std::find_if(s_EthosU85Configs, std::cend(s_EthosU85Configs), + [&](const AcceleratorConfig &config) { return config.macs == macs; }); + if ( cfg == std::cend(s_EthosU85Configs) ) + { + assert(macs == 128 || macs == 256 || macs == 512 || macs == 1024 || macs == 2048); + LOG_TRACE0("Unable to find Ethos U85 accelerator for macs={}", macs); + return false; + } + + ApplyConfig(cfg); + + return true; +} + +void ArchEthosU85::ApplyConfig(const AcceleratorConfig *cfg) +{ + // Basic configuration + _cores = cfg->cores; + _macs = cfg->macs; + _ifmUBlock = cfg->ifmUBlock; + _nOfmUBlocks = cfg->nOfmUBlocks; + std::copy(std::begin(cfg->ofmUBlocks), std::end(cfg->ofmUBlocks), std::begin(_ofmUBlocks)); + + // Internal memory + _ifmRamSizeBytes = cfg->ifmRamSizeBytes; + _accRamSizeBytes = 
cfg->accRamSizeBytes; + _numAxiSramLog2 = cfg->numAxiSramLog2; + _numAxiExtLog2 = cfg->numAxiExtLog2; + + _lutRam = std::make_unique("lutram", 2048); + // TODO MLBEDSW-7980 fix LUT performance parameters + _lutRam->SetParameters(1, 0, 0, 1, 0); + _lutMemory = _lutRam.get(); + _performance = std::unique_ptr(new EthosU85Performance(this, cfg->perfInfo)); + + // Populate ofmUBlock -> NpuOp lookup table + SetupOfmUBlockToOpTable(); + // Populate ofmUBlock -> ifmAlloc unit table + SetupOfmUBlockToIfmAuTable(); +} + + +std::unique_ptr ArchEthosU85::GetOpConfig(OpType opType, const ArchitectureConfigQuery &query) +{ + auto config = FindBlockConfig(opType, query); + return config; +} + + +std::unique_ptr ArchEthosU85::CreateOpGroup(const ArchitectureOpGroupQuery &op) +{ + LOG_TRACE1("Trying to create ArchEthosU85 OpGroup for {}\n", OpTypeToString(op.type)); + + auto group = std::make_unique(); + if ( !group->Add(op) ) + { + return nullptr; + } + + return group; +} + +std::vector ArchEthosU85::ConfigRegisters() +{ + return std::vector(1, ConfigRegister(2)); +} + +int ArchEthosU85::UpscaleAndRounding(ArchResampling resampling, int &rounding) +{ + rounding = (resampling == ArchResampling::Nearest) ? 1 : 0; + return (resampling == ArchResampling::Zeros) ? 
2 : 1; +} + +AxisMask ArchEthosU85::CanSubdivide(OpType opType) +{ + if ( IsConvolution(opType) || IsElementwise(opType) || IsPooling(opType) ) + { + return AxisMask::AxisY; + } + return AxisMask::None; +} + +bool ArchEthosU85::SupportsLeakyRelu(bool /*quantized*/, DataType /*type*/) +{ + return true; +} + +bool ArchEthosU85::SupportsMatMul(OpType opType) +{ + EthosU85NpuOp npuOp = GetHWOp(opType); + if ( npuOp == EthosU85NpuOp::None ) + { + return false; + } + + return true; +} + +bool ArchEthosU85::SupportsTranspose(OpType opType, TransposeType transposeType) +{ + if ( IsNone(transposeType) ) return true; + + EthosU85NpuOp npuOp = GetHWOp(opType); + if ( npuOp == EthosU85NpuOp::None || npuOp == EthosU85NpuOp::Resize || npuOp == EthosU85NpuOp::Dma ) + { + return false; + } + else if ( npuOp == EthosU85NpuOp::Elementwise ) + { + return transposeType == TransposeType::NHWC || transposeType == TransposeType::NHCW || transposeType == TransposeType::NCHW; + } + + return transposeType == TransposeType::NHWC || transposeType == TransposeType::NWHC || transposeType == TransposeType::NHCW || + transposeType == TransposeType::NWCH || transposeType == TransposeType::NCHW || transposeType == TransposeType::NCWH; +} + +bool ArchEthosU85::SupportsReverse(OpType opType, ReverseType reverseType) +{ + if ( reverseType == ReverseType::None ) return true; + + EthosU85NpuOp npuOp = GetHWOp(opType); + if ( npuOp == EthosU85NpuOp::None || npuOp == EthosU85NpuOp::Elementwise || npuOp == EthosU85NpuOp::Dma ) + { + return false; + } + + return reverseType == ReverseType::H || reverseType == ReverseType::W || reverseType == ReverseType::C; +} + +bool ArchEthosU85::SupportsGather(OpType opType) +{ + EthosU85NpuOp npuOp = GetHWOp(opType); + if ( npuOp == EthosU85NpuOp::None ) + { + return false; + } + + return true; +} + +bool ArchEthosU85::SupportsScatter(OpType opType) +{ + EthosU85NpuOp npuOp = GetHWOp(opType); + if ( npuOp == EthosU85NpuOp::None ) + { + return false; + } + + return true; 
+} + +bool ArchEthosU85::SupportsSigmoidTanhLutInt16(OpType opType) +{ + return (opType == OpType::Sigmoid || opType == OpType::Tanh); +} + +bool ArchEthosU85::SupportsArgMax(OpType opType) +{ + EthosU85NpuOp npuOp = GetHWOp(opType); + if ( npuOp == EthosU85NpuOp::None ) + { + return false; + } + + return true; +} + +bool ArchEthosU85::SupportsResize(const ResizeSupportQuery &query) +{ + /* Supported operator checks for resize operations + * + * * Scaling numerators must be less than or equal to 2048 + * * Offsets must be in the range [-numerator, numerator) for each axis + * * The following constraints apply to upscale-factors + * mode REPLICATE: + * Any width and height upscale-factors are supported + * mode NEAREST: + * Any width and height upscale-factors are supported + * mode BILINEAR: + * if IFM W*H == 1*1: + * Any width and height upscale-factors are supported + * else: + * The upscale-factors need to be powers-of-two. + */ + if ( query.ifmShape.Width() == 1 && query.ifmShape.Height() == 1 ) + { + return true; + } + + int n_w = query.scaleX.n; + int d_w = query.scaleX.d; + int n_h = query.scaleY.n; + int d_h = query.scaleY.d; + bool supported = true; + + if ( n_h > 2048 ) + { + LOG_WARN("Resize height scale numerator ({}) exceeds maximum size (2048).\n", n_h); + supported = false; + } + if ( n_w > 2048 ) + { + LOG_WARN("Resize width scale numerator ({}) exceeds maximum size (2048).\n", n_w); + supported = false; + } + if ( query.offsetY >= n_h || query.offsetY < -n_h ) + { + LOG_WARN("Resize height offset: {} is outside the valid range [-height_numerator, height_numerator) = [{}, {})\n", + query.offsetY, -n_h, n_h); + supported = false; + } + if ( query.offsetX >= n_w || query.offsetX < -n_w ) + { + LOG_WARN("Resize width offset: {} is outside the valid range [-with_numerator, width_numerator) = [{}, {})\n", + query.offsetX, -n_w, n_w); + supported = false; + } + + if ( query.mode == ArchResizeMode::Bilinear ) + { + // Get scale fractions and verify that 
scale-factor is a power of two. + + if ( n_w % d_w != 0 ) + { + LOG_WARN("ResizeBilinear width scale-factor is not an integer: {}/{}\n", n_w, d_w); + supported = false; + } + if ( n_h % d_h != 0 ) + { + LOG_WARN("ResizeBilinear height scale-factor is not an integer: {}/{}\n", n_h, d_h); + supported = false; + } + int scale_w = n_w / d_w; + int scale_h = n_h / d_h; + if ( !IsPowerOfTwo(scale_w) ) + { + LOG_WARN("ResizeBilinear width scale-factor is not a power of two: {}\n", double(n_w) / d_w); + supported = false; + } + if ( !IsPowerOfTwo(scale_h) ) + { + LOG_WARN("ResizeBilinear height scale-factor is not a power of two: {}\n", double(n_h) / d_h); + supported = false; + } + return supported; + } + return supported; +} + +bool ArchEthosU85::SupportsAccumulatorMode(ArchAccumulatorSource source, bool outputEnabled) +{ + UNUSED(outputEnabled); + return source == ArchAccumulatorSource::Reset || source == ArchAccumulatorSource::Acc || source == ArchAccumulatorSource::Ifm2; +} + +bool ArchEthosU85::SupportsScalar(OpType opType, DataType dataType, TensorUsage usage) +{ + bool supportedType(dataType == DataType::Int8 || dataType == DataType::UInt8 || dataType == DataType::Int16 || dataType == DataType::Int32); + return EthosU85RCSGenerator::IsSupportedElementwise(opType) && supportedType && IsIFM(usage); +} + +Flags ArchEthosU85::SupportedWeightFormat(OpType op) +{ + auto hwOp = GetHWOp(op); + if ( hwOp == EthosU85NpuOp::Convolution || hwOp == EthosU85NpuOp::VectorProduct ) + { + return Flags(WeightFormat::Default, WeightFormat::Fast, WeightFormat::Sparse2_4); + } + return Flags(WeightFormat::Default); +} + +bool ArchEthosU85::UseAvgPoolNop(OpType type) +{ + return IsActivation(type) || type == OpType::Quantize || type == OpType::MemoryCopy; +} + +static bool ChooseKernelMethod(const Shape &ifmShape, int ifmBits, const Kernel *kernel) +{ + if ( ifmShape.Depth() <= 8 ) + { + return true; + } + + // Compare part-kernel to depth-kernel and choose the one with best utilisation 
+ int kernelElements = kernel->ElementsWH(); + double depthUtilisation = ifmShape.Depth() / double(RoundAway(ifmShape.Depth(), ifmBits == 8 ? 32 : 16)); + double partUtilisation = + (ifmShape.Depth() / double(RoundAway(ifmShape.Depth(), 8)) * + (kernelElements / double(RoundAway(kernelElements, ifmBits == 8 ? 4 : 2)))); + + return partUtilisation >= depthUtilisation; +} + + +static Shape GetArchIFMBlockSize(const Shape &ofmBlock, const Kernel *kernel, const Shape &ublock, + const Shape &subkernelLimit, int upscale, int rounding) +{ + Point2i dilatedSize = kernel->DilatedWH(); + + // IFM block height + int h = RequiredInputSize(ofmBlock.Height(), kernel->Stride().y, std::min(dilatedSize.y, subkernelLimit.Height()), upscale, rounding); + h = RoundAway(h, ublock.Height()); + + // IFM block width + int w = RequiredInputSize(ofmBlock.Width(), kernel->Stride().x, std::min(dilatedSize.x, subkernelLimit.Width()), upscale, rounding); + w = RoundAway(w, ublock.Width()); + + return Shape(1, h, w, ofmBlock.Depth()); +} + +unsigned MaskForNpuOp(const EthosU85NpuOp npuOp, bool hasIfm2 = false) +{ + if ( npuOp == EthosU85NpuOp::VectorProduct && hasIfm2 ) + { + // first bit is reserved for matmul + return 1; + } + return 1 << (int(npuOp)); +} + +int ArchEthosU85::IndexForOfmUBlock(const Shape &ofmUBlock) +{ + auto it = std::find(_ofmUBlocks.begin(), _ofmUBlocks.end(), ofmUBlock); + if ( it == _ofmUBlocks.end() ) + { + LOG_WARN("OFM microblock {} is not supported for this configuration\n", ofmUBlock.ToString()); + assert(false); + } + return int(std::distance(_ofmUBlocks.begin(), it)); +} + +void ArchEthosU85::SetupOfmUBlockToIfmAuTable() +{ + if ( _macs == 128 ) + { + int b_1x2x8 = IndexForOfmUBlock(Shape(1, 2, 8)); + int b_1x1x16 = IndexForOfmUBlock(Shape(1, 1, 16)); + _uBlockToIfmAuTable[b_1x2x8] = {Shape(1, 2, 1), Shape(1, 1, 2), Shape(1, 1, 2)}; + _uBlockToIfmAuTable[b_1x1x16] = _uBlockToIfmAuTable[b_1x2x8]; + } + else if ( _macs == 256 ) + { + int b_2x2x8 = 
IndexForOfmUBlock(Shape(2, 2, 8)); + int b_1x4x8 = IndexForOfmUBlock(Shape(1, 4, 8)); + int b_1x2x16 = IndexForOfmUBlock(Shape(1, 2, 16)); + _uBlockToIfmAuTable[b_2x2x8] = {Shape(2, 2, 1), Shape(1, 2, 2), Shape(1, 1, 4)}; + _uBlockToIfmAuTable[b_1x2x16] = _uBlockToIfmAuTable[b_2x2x8]; + _uBlockToIfmAuTable[b_1x4x8] = {Shape(1, 4, 1), Shape(1, 2, 2), Shape(1, 1, 4)}; + } + else if ( _macs == 512 ) + { + int b_2x2x16 = IndexForOfmUBlock(Shape(2, 2, 16)); + int b_1x4x16 = IndexForOfmUBlock(Shape(1, 4, 16)); + _uBlockToIfmAuTable[b_2x2x16] = {Shape(2, 2, 1), Shape(1, 2, 2), Shape(1, 1, 4)}; + _uBlockToIfmAuTable[b_1x4x16] = {Shape(1, 4, 1), Shape(1, 2, 2), Shape(1, 1, 4)}; + } + else if ( _macs == 1024 ) + { + int b_2x2x32 = IndexForOfmUBlock(Shape(2, 2, 32)); + int b_1x4x32 = IndexForOfmUBlock(Shape(1, 4, 32)); + int b_2x4x16 = IndexForOfmUBlock(Shape(2, 4, 16)); + _uBlockToIfmAuTable[b_2x2x32] = {Shape(2, 4, 1), Shape(2, 2, 2), Shape(1, 2, 4)}; + _uBlockToIfmAuTable[b_2x4x16] = _uBlockToIfmAuTable[b_2x2x32]; + _uBlockToIfmAuTable[b_1x4x32] = {Shape(2, 4, 1), Shape(1, 4, 2), Shape(1, 2, 4)}; + } + else + { + int b_2x2x64 = IndexForOfmUBlock(Shape(2, 2, 64)); + int b_1x4x64 = IndexForOfmUBlock(Shape(1, 4, 64)); + int b_4x4x16 = IndexForOfmUBlock(Shape(4, 4, 16)); + _uBlockToIfmAuTable[b_2x2x64] = {Shape(4, 4, 1), Shape(2, 4, 2), Shape(2, 2, 4)}; + _uBlockToIfmAuTable[b_4x4x16] = _uBlockToIfmAuTable[b_2x2x64]; + _uBlockToIfmAuTable[b_1x4x64] = {Shape(4, 4, 1), Shape(2, 4, 2), Shape(1, 4, 4)}; + } +} + +void ArchEthosU85::SetupOfmUBlockToOpTable() +{ + unsigned conv = MaskForNpuOp(EthosU85NpuOp::Convolution); + unsigned depthwise = MaskForNpuOp(EthosU85NpuOp::Depthwise); + unsigned vectorprod = MaskForNpuOp(EthosU85NpuOp::VectorProduct); + unsigned pool = MaskForNpuOp(EthosU85NpuOp::Pooling); + unsigned reducesum = MaskForNpuOp(EthosU85NpuOp::ReduceSum); + unsigned elementwise = MaskForNpuOp(EthosU85NpuOp::Elementwise); + unsigned resize = 
MaskForNpuOp(EthosU85NpuOp::Resize); + unsigned matmul = MaskForNpuOp(EthosU85NpuOp::VectorProduct, true); + unsigned dma = MaskForNpuOp(EthosU85NpuOp::Dma, true); + + // clang-format off + if ( _macs == 128 ) + { + unsigned b_1x2x8 = IndexForOfmUBlock(Shape(1, 2, 8)); + unsigned b_1x1x16 = IndexForOfmUBlock(Shape(1, 1, 16)); + _uBlockToOpTable[b_1x2x8] = { + // 8 bit ifm + conv | matmul | vectorprod | reducesum | elementwise | resize, + // 16 bit ifm + conv | matmul | vectorprod | depthwise | pool | reducesum | elementwise | resize, + // 32 bit ifm + reducesum | elementwise | resize, + }; + _uBlockToOpTable[b_1x1x16] = { + depthwise | pool | elementwise | resize, + vectorprod | elementwise | resize, + elementwise | resize + }; + } + else if ( _macs == 256 ) + { + unsigned b_2x2x8 = IndexForOfmUBlock(Shape(2, 2, 8)); + unsigned b_1x4x8 = IndexForOfmUBlock(Shape(1, 4, 8)); + unsigned b_1x2x16 = IndexForOfmUBlock(Shape(1, 2, 16)); + _uBlockToOpTable[b_2x2x8] = { + conv | matmul | vectorprod | reducesum | elementwise | resize, + conv | matmul | vectorprod | depthwise | pool | reducesum | elementwise | resize, + reducesum | elementwise | resize + }; + _uBlockToOpTable[b_1x4x8] = { + conv | matmul | vectorprod | reducesum | elementwise | resize, + conv | matmul | vectorprod | depthwise | pool | reducesum | elementwise | resize, + reducesum | elementwise | resize + }; + _uBlockToOpTable[b_1x2x16] = { + depthwise | pool | elementwise | resize, + vectorprod | elementwise | resize, + elementwise | resize + }; + } + else if ( _macs == 512 ) + { + unsigned b_2x2x16 = IndexForOfmUBlock(Shape(2, 2, 16)); + unsigned b_1x4x16 = IndexForOfmUBlock(Shape(1, 4, 16)); + _uBlockToOpTable[b_2x2x16] = { + conv | depthwise | vectorprod | pool | reducesum | elementwise | resize | matmul, + conv | depthwise | vectorprod | pool | reducesum | elementwise | resize | matmul, + reducesum | elementwise | resize, + }; + _uBlockToOpTable[b_1x4x16] = { + conv | depthwise | vectorprod | pool | 
reducesum | elementwise | resize | matmul, + conv | depthwise | vectorprod | pool | reducesum | elementwise | resize | matmul, + reducesum | elementwise | resize, + }; + } + else if ( _macs == 1024 ) + { + unsigned b_2x2x32 = IndexForOfmUBlock(Shape(2, 2, 32)); + unsigned b_1x4x32 = IndexForOfmUBlock(Shape(1, 4, 32)); + unsigned b_2x4x16 = IndexForOfmUBlock(Shape(2, 4, 16)); + _uBlockToOpTable[b_2x2x32] = { + conv | matmul | vectorprod | elementwise, + conv | matmul | vectorprod | elementwise, + elementwise, + }; + _uBlockToOpTable[b_1x4x32] = { + conv | matmul | vectorprod | elementwise, + conv | matmul | vectorprod | elementwise, + elementwise, + }; + _uBlockToOpTable[b_2x4x16] = { + conv | vectorprod | depthwise | pool | reducesum | elementwise | resize, + conv | vectorprod | depthwise | pool | reducesum | elementwise | resize, + reducesum | elementwise | resize, + }; + } + else + { // 2048 + unsigned b_2x2x64 = IndexForOfmUBlock(Shape(2, 2, 64)); + unsigned b_1x4x64 = IndexForOfmUBlock(Shape(1, 4, 64)); + unsigned b_4x4x16 = IndexForOfmUBlock(Shape(4, 4, 16)); + _uBlockToOpTable[b_2x2x64] = { + conv | matmul | vectorprod | elementwise, + conv | matmul | vectorprod | elementwise, + elementwise, + }; + _uBlockToOpTable[b_1x4x64] = { + conv | matmul | vectorprod | elementwise, + conv | matmul | vectorprod | elementwise, + elementwise, + }; + _uBlockToOpTable[b_4x4x16] = { + conv | vectorprod | depthwise | pool | reducesum | elementwise | resize, + conv | vectorprod | depthwise | pool | reducesum | elementwise | resize, + reducesum | elementwise | resize, + }; + } + // clang-format on +} + +bool ArchEthosU85::IsUBlockValid(const OpType opType, int ifmBits, const Shape &ofmUBlock, bool hasIfm2) +{ + EthosU85NpuOp npuOp = GetHWOp(opType); + if ( npuOp == EthosU85NpuOp::None ) + { + return false; + } + + unsigned blockIdx = IndexForOfmUBlock(ofmUBlock); + if ( blockIdx >= _uBlockToOpTable.size() ) + { + LOG_WARN("OFM microblock {} is not a valid block for Ethos 
U85-{}\n", ofmUBlock.ToString(), _macs); + return false; + } + + auto &bitsToOperations = _uBlockToOpTable[blockIdx]; + + unsigned bitIdx = (ifmBits / 16); + if ( bitIdx >= bitsToOperations.size() ) + { + LOG_DEBUG("(OFM microblock validation - ifmbits: {} is not a valid ifm precision\n", ifmBits); + return false; + } + + // one-hot encoded mask for NpuOp operations + unsigned opmask = MaskForNpuOp(npuOp, hasIfm2); + return bitsToOperations[bitIdx] & opmask; +} + +bool IsMinMaxReduction(OpType opType, const Kernel *kernel) +{ + // MIN/MAX Reduction over width or height is defined as a MAX/MIN-pool with 1-D kernel. + return (opType == OpType::MaxPool || opType == OpType::Min) && (kernel->Size().x == 1 || kernel->Size().y == 1); +} + +Shape ArchEthosU85::FindUBlock(OpType opType, const ArchitectureConfigQuery &query) +{ + int lookupBits = query.ifmBits; + if ( IsMinMaxReduction(opType, query.kernel) && lookupBits == 32 ) + { + // 16-bit microblock lookup-table is used for + // 32-bit Min/Max reductions. 
+ lookupBits = 16; + } + + const EthosU85NpuOp npuOp = GetHWOp(opType); + assert(npuOp != EthosU85NpuOp::None); + + int bestWaste = std::numeric_limits::max(); + Shape bestUblk; + + for ( int i = 0; i < _nOfmUBlocks; i++ ) + { + const Shape &ublk = _ofmUBlocks[i]; + if ( !IsUBlockValid(opType, lookupBits, ublk, query.ifmShape[1] != Shape()) ) + { + continue; + } + + Shape tmp = Shape::RoundAway(query.ofmShape, ublk); + int waste = tmp.Elements() - query.ofmShape.Elements(); + if ( waste < bestWaste ) + { + bestUblk = ublk; + bestWaste = waste; + } + } + + return bestUblk; +} + +std::unique_ptr ArchEthosU85::FindBlockConfig(OpType opType, const ArchitectureConfigQuery &query) +{ + assert(query.ifmBits > 0 && query.ifmBits <= 32); + assert(query.ofmShape.Size() > 2 && "Insufficient dimensions to search for block config"); + assert(query.kernel != nullptr); + + if ( !SupportsAccumulatorMode(query.accSource, query.accOutputEnabled) ) return nullptr; + + const int OFMSplitDepth = 16; // Specific to this architecture + + // Elementwise larger-volume correction + const Shape &ifmShape = (query.ifmShape[1].Elements() > query.ifmShape[0].Elements()) ? 
query.ifmShape[1] : query.ifmShape[0]; + + EthosU85NpuOp npuOp = GetHWOp(opType); + assert(npuOp != EthosU85NpuOp::None); + + // Operator typing help + bool isPooling = npuOp == EthosU85NpuOp::Pooling || npuOp == EthosU85NpuOp::ReduceSum; + bool isReduceSum = npuOp == EthosU85NpuOp::ReduceSum; + bool isDepthwise = npuOp == EthosU85NpuOp::Depthwise; + bool isElementwise = npuOp == EthosU85NpuOp::Elementwise; + bool isConvolution = npuOp == EthosU85NpuOp::Convolution || npuOp == EthosU85NpuOp::Depthwise; + bool isResize = npuOp == EthosU85NpuOp::Resize; + bool isDma = npuOp == EthosU85NpuOp::Dma; + bool isPartKernel = isConvolution && ChooseKernelMethod(ifmShape, query.ifmBits, query.kernel); + bool isEqualDepthOp = isElementwise || (isPooling && !isReduceSum) || isDepthwise || isResize; + + if ( isDma ) + { + // DMA ops doesn't use block config + return nullptr; + } + + // Operator configuration to be returned + auto config = std::make_unique(); + + EthosU85Traversal traversal = isDepthwise ? EthosU85Traversal::Depthwise : (isPartKernel ? 
EthosU85Traversal::PartKernel : EthosU85Traversal::DepthFirst); + + // Accumulator settings + EthosU85Accumulator accType = EthosU85Accumulator::Acc32; + if ( query.ifmBits == 16 && (!isPooling || isReduceSum) && query.scaled ) + { + accType = EthosU85Accumulator::Acc48; + } + else if ( query.ifmBits == 64 && isPooling ) + { + // Special case for Rescale int48 + accType = EthosU85Accumulator::Acc48; + } + + int accBits = AccumulatorBits(accType); + int rounding; + int upscale = UpscaleAndRounding(query.ifmResampling, rounding); + int numBlocksInRam = 2; + + const Shape ofmUBlock = FindUBlock(opType, query); + if ( ofmUBlock == Shape() ) + { + // no valid ofm microblock found + LOG_WARN("Could not find a valid OFM microblock for {} with {}-bit input.\n", OpTypeToString(opType), query.ifmBits); + return nullptr; + } + + // Subkernel repeats of the IFM + Point2i dilatedWH = query.kernel->DilatedWH(); + int ifmRepeats = DivRoundUp(dilatedWH.x, _subkernelMax.Width()) * DivRoundUp(dilatedWH.y, _subkernelMax.Height()); + + int ifmBlockDepth = 0; + const bool sparse = query.weightFormat & WeightFormat::Sparse2_4; + if ( isPartKernel ) + { + ifmBlockDepth = 16; + } + else if ( query.ifmBits == 32 || ((_macs == 128 || _macs == 256) && ofmUBlock.Depth() == 16 && !sparse) ) + { + ifmBlockDepth = 32; + } + else if ( sparse && traversal == EthosU85Traversal::DepthFirst && query.ifmBits == 8 ) + { + ifmBlockDepth = 128; + } + else + { + ifmBlockDepth = 64; + } + + // Weights fetch (for operators that have them) + int weightFetchWH = isConvolution ? query.kernel->Size().AreaXY() : 0; + + int ofmUBlockDepth = ofmUBlock.Depth(); + + // When using brick format and certain transposes, there are additional constraints to the block size, so we must + // extend the search space to be able to find a valid block size. 
+ Shape ofmBlockMin = Shape(0, 0, 0); + if ( query.ofmFormat == TensorFormat::NHCWB16 ) + { + switch ( query.transpose ) + { + case TransposeType::NCHW: + case TransposeType::NHCW: + ofmBlockMin = ofmBlockMin.WithWidth(16); + break; + case TransposeType::NCWH: + case TransposeType::NWCH: + ofmBlockMin = ofmBlockMin.WithHeight(16); + break; + default: + break; + } + } + Shape searchSpaceStep = Shape::Max(ofmUBlock, ofmBlockMin); + Shape ofmBlockMaxTp = _ofmBlockMax.Untranspose(Reduce4To3(query.transpose)); + Shape searchSpaceEnd = Shape::RoundAway(Shape::Max(Shape::Min(query.ofmShape, ofmBlockMaxTp), searchSpaceStep), ofmUBlock); + + if ( isResize ) + { + // resize operations are constrained to OFM block height 1 and depth 1-16 + // TODO MLBEDSW-8573: Improve block config search for Resize/Elementwise operations + int resizeMaxWidth = CalcResizeMaxOfmBlockWidth(query.ifmBits, query.rescaling.scaleX.n, query.rescaling.scaleX.d); + // reduce minimal step if max width becomes smaller than the minimal step + if ( resizeMaxWidth < searchSpaceStep.Width() ) + { + searchSpaceStep = searchSpaceStep.WithWidth(resizeMaxWidth); + } + searchSpaceStep = searchSpaceStep.WithHeight(1); + searchSpaceEnd = searchSpaceEnd.WithHeight(1).WithDepth(16).WithWidth(resizeMaxWidth); + } + + // At this point, OFM is already configured to NHWC but we need to limit OFM block depth as well. 
+ if ( query.reverse == ReverseType::C ) + { + searchSpaceEnd = Shape::Min(searchSpaceEnd, searchSpaceEnd.WithDepth(16)); + } + + // Block WHC search, loops across the search space looking for best efficiency + float bestCost = std::numeric_limits::infinity(); + float bestCoverage = std::numeric_limits::infinity(); + int ofmElements = query.ofmShape.Elements(); + + int depth = std::max(ofmUBlockDepth, std::min(searchSpaceEnd.Depth(), OFMSplitDepth)); + int restartDepth = depth; + if ( depth < query.ofmShape.Depth() ) + { + depth = RoundAway(depth, OFMSplitDepth); + } + + Shape ifmAllocUnit = CalcIfmAUSize(ifmBlockDepth, query.ifmBits, ofmUBlock); + + std::unordered_set> wontFit; + while ( depth <= searchSpaceEnd.Depth() ) + { + if ( isEqualDepthOp ) + { + // For equal depth ops, IFMBlockDepth == OFMBlockDepth + // Recalculate the IFM AU for the new depth + ifmBlockDepth = depth; + ifmAllocUnit = CalcIfmAUSize(depth, query.ifmBits, ofmUBlock); + } + + for ( int height = searchSpaceStep.Height(); height <= searchSpaceEnd.Height(); height += searchSpaceStep.Height() ) + { + for ( int width = searchSpaceStep.Width(); width <= searchSpaceEnd.Width(); width += searchSpaceStep.Width() ) + { + // Avoid checking W/H transposed blocks that already didn't fit. i.e. if 8x4x16 didn't + // fit, then 4x8x16 won't either. 
+ if ( wontFit.count(Point2i(height, width)) > 0 ) + { + continue; + } + + // Calculate the IFM block dimensions required to feed this OFM block + Shape ofmBlock = Shape(height, width, depth); + + Shape ifmBlock = GetArchIFMBlockSize(ofmBlock, query.kernel, ifmAllocUnit, _subkernelMax, upscale, rounding); + ifmBlock = ifmBlock.WithDepth(ifmBlockDepth); + + // Test if the IFM/OFM blocks fit into RAM + if ( TryBlockConfig(npuOp, ofmBlock, ifmBlock, ifmShape, query.ifmBits, accBits, _ifmRamSizeBytes, + _accRamSizeBytes, ifmAllocUnit.Depth(), numBlocksInRam, isEqualDepthOp) ) + { + Shape fullBlocks = Shape::DivRoundUp(query.ofmShape, ofmBlock); + Point3 blocks = query.ofmShape.HWC() / ofmBlock.HWC(); + + // Weights fetching + float weightFetch = float(weightFetchWH) * ifmShape.Depth() * fullBlocks.ElementsWH(); + if ( !isDepthwise ) + { + weightFetch *= blocks.z * ofmBlock.Depth(); + } + + // IFM fetching + float ifmFetch = float(ifmBlock.ElementsWH()) * ifmShape.Depth() * ifmRepeats * blocks.x * blocks.y; + if ( !isEqualDepthOp ) + { + ifmFetch *= fullBlocks.Depth(); + } + + // Scale relative to every output OFM element + float relativeCost = + (isElementwise || isResize) ? 
float(ofmElements) / (float(height) * width * depth) : (ifmFetch + weightFetch) / float(ofmElements); + + // If the entire IFM can be encompassed by both buffers, bias to prefer this configuration + if ( ifmShape.Elements() < ifmBlock.Elements() * 2 ) + { + relativeCost = relativeCost / 2.0f; + } + + // Choose based on relative minimum cost or larger IFM area (if equal cost) + if ( relativeCost <= bestCost ) + { + bool chooseThis = false; + // Check IFM coverage only when it's equal best_cost and small OFM + if ( relativeCost == bestCost ) + { + Shape coverageShape = Shape::Min(ifmShape, ifmBlock); + float coverage = float(ifmShape.ElementsWH()) / float(coverageShape.ElementsWH()); + // Small 4x4 IFM constraint found through analysis of networks + if ( coverage <= bestCoverage && (height <= 4 && width <= 4) ) + { + bestCoverage = coverage; + chooseThis = true; + } + } + else + { + bestCoverage = std::numeric_limits::infinity(); + chooseThis = true; + } + + if ( chooseThis ) + { + bestCost = relativeCost; + config->_ifmBlock = std::move(ifmBlock); + config->_ofmBlock = Shape(1, height, width, depth); + } + } + } + else + { + wontFit.emplace(width, height); + } + } + } + + // Try Next block depth, rounded + depth = depth + ofmUBlockDepth; + if ( depth < query.ofmShape.Depth() ) + { + depth = RoundAway(depth, OFMSplitDepth); + } + if ( depth > searchSpaceEnd.Depth() && bestCost == std::numeric_limits::infinity() && numBlocksInRam == 2 ) + { + numBlocksInRam = 1; + depth = restartDepth; + } + } + + config->_ofmUBlock = std::move(ofmUBlock); + config->_accumulatorType = accType; + config->_accumulatorSource = query.accSource; + config->_accumulatorOutputEnabled = query.accOutputEnabled; + config->_ifmRamSizeBytes = _ifmRamSizeBytes; + config->_traversal = traversal; + + // Return the best configuration + if ( bestCost != std::numeric_limits::infinity() ) + { + return std::unique_ptr(config.release()); + } + + // Didn't find a configuration + return nullptr; +} + +Shape 
ArchEthosU85::CalcIfmAUSize(int ifmBlkDepth, int ifmBits, Shape ofmUBlk) +{ + int ifmu = 0; + int ifmDepthBits = ifmBlkDepth * ifmBits; + if ( ifmDepthBits > 256 ) + { + // ifmu3 + ifmu += 2; + } + else if ( ifmDepthBits > 128 ) + { + // ifmu2 + ifmu++; + } + assert(ifmu < 3); + unsigned blockIdx = IndexForOfmUBlock(ofmUBlk); + return _uBlockToIfmAuTable[blockIdx][ifmu]; +} + +int ArchEthosU85::CalcResizeMaxOfmBlockWidth(int ifmBits, int scaleN, int scaleD) +{ + // Calculate the maximum OfmBlockWidth that still allows + // the IFM block to fit in the chaining buffer + assert(scaleN > 0); + assert(scaleD > 0); + int numIfmCbSlots = _macs / 16; + if ( ifmBits == 16 ) + { + numIfmCbSlots /= 2; + } + int maxOfmBlkW = int(std::ceil(((numIfmCbSlots - 2) * scaleN + 1) / double(scaleD))); + maxOfmBlkW = std::max(1, std::min(maxOfmBlkW, _ofmBlockMax.Width())); + return maxOfmBlkW; +} + +bool ArchEthosU85::TryBlockConfig(EthosU85NpuOp npuOp, const Shape &ofmBlock, const Shape &ifmBlock, const Shape &ifmShape, + int ifmBits, int accBits, int ifmSpace, int accSpace, int ifmAuDepth, int numBlocksInRam, bool isEqualDepthOp) +{ + assert(accBits > 0); + assert((ifmBits >= 8) && ((ifmBits % 8) == 0)); + + // Elementwise and Resize don't use IB/AB. + if ( npuOp == EthosU85NpuOp::Elementwise || npuOp == EthosU85NpuOp::Resize ) + { + return true; + } + + // IFM Space + int ifmAlignDepth = ifmAuDepth * 128 / ifmBits; + int ifmBlockDepth = isEqualDepthOp ? 
ofmBlock.Depth() : std::min(ifmBlock.Depth(), ifmShape.Depth()); + ifmBlockDepth = RoundAway(ifmBlockDepth, ifmAlignDepth); + int ifmBytes = ifmBlock.ElementsWH() * ifmBlockDepth * (ifmBits / 8) * numBlocksInRam; + + // Accumulator space + int ofmBlockDepth = RoundAway(ofmBlock.Depth(), 16); + int accBytes = (ofmBlock.ElementsWH() * ofmBlockDepth * accBits) / 8 * numBlocksInRam; + + if ( ifmBytes > ifmSpace || accBytes > accSpace ) + { + return false; + } + + return true; +} + + +Shape ArchEthosU85::GetStorageRounding(TensorFormat format) +{ + if ( format == TensorFormat::NHCWB16 ) + { + return Shape(1, 1, 1, 16); + } + + return Shape(1, 1, 1, 1); +} + +uint32_t ArchEthosU85::ConfigRegister(int product) +{ + uint32_t macsLog2 = IntLog2(_macs); + uint32_t numWdLog2 = IntLog2(_cores); + + return EthosU85RCSGenerator::ConfigRegister(macsLog2, 1, _numAxiSramLog2, _numAxiExtLog2, numWdLog2, product); +} + + +std::unique_ptr EthosU85OpConfig::Clone() +{ + auto config = std::make_unique(); + config->_ifmRamSizeBytes = _ifmRamSizeBytes; + config->_traversal = _traversal; + config->_accumulatorType = _accumulatorType; + config->_accumulatorSource = _accumulatorSource; + config->_accumulatorOutputEnabled = _accumulatorOutputEnabled; + config->_ofmBlock = _ofmBlock; + config->_ofmUBlock = _ofmUBlock; + config->_ifmBlock = _ifmBlock; + return std::unique_ptr(config.release()); +} + +int EthosU85OpConfig::MaxIFMBuffering() +{ + return _ifmRamSizeBytes; +} + +Point2i EthosU85OpConfig::OptimalStripeGranule() +{ + return _ofmBlock.WH(); +} + +int EthosU85OpConfig::OptimalDepthGranule() +{ + return _ofmBlock.Depth(); +} + +std::string EthosU85OpConfig::ToString(bool full) +{ + std::string tmp = fmt::format("OFM Block=[{}], IFM Block=[{}], OFM UBlock=[{}] Traversal={}, AccType={}", _ofmBlock.ToString(), + _ifmBlock.ToString(), _ofmUBlock.ToString(), EnumToString(_traversal), EnumToString(_accumulatorType)); + UNUSED(full); + return tmp; +} + +EthosU85NpuOp 
ArchEthosU85::GetHWOp(OpType type) +{ + static const std::unordered_map toNpuOp = { + {OpType::DepthwiseConv2DBias, EthosU85NpuOp::Depthwise}, + {OpType::Conv2D, EthosU85NpuOp::Convolution}, + {OpType::Conv2DBackpropInput, EthosU85NpuOp::Convolution}, + {OpType::Conv2DBackpropInputSwitchedBias, EthosU85NpuOp::Convolution}, + {OpType::Conv2DBias, EthosU85NpuOp::Convolution}, + {OpType::ReduceSum, EthosU85NpuOp::ReduceSum}, + {OpType::FullyConnected, EthosU85NpuOp::VectorProduct}, + {OpType::MatMul, EthosU85NpuOp::VectorProduct}, + {OpType::MaxPool, EthosU85NpuOp::Pooling}, + {OpType::AvgPool, EthosU85NpuOp::Pooling}, + {OpType::QuantizedAvgPool, EthosU85NpuOp::Pooling}, + {OpType::QuantizedMaxPool, EthosU85NpuOp::Pooling}, + {OpType::Sum, EthosU85NpuOp::Pooling}, + {OpType::Min, EthosU85NpuOp::Pooling}, + {OpType::ArgMax, EthosU85NpuOp::Pooling}, + // TODO MLBEDSW-7986 add none pooling + {OpType::Resize, EthosU85NpuOp::Resize}, + {OpType::Gather, EthosU85NpuOp::Dma}, + {OpType::Scatter, EthosU85NpuOp::Dma}, + }; + + auto pos = toNpuOp.find(type); + if ( pos != toNpuOp.end() ) + { + return pos->second; + } + else if ( EthosU85RCSGenerator::IsSupportedElementwise(type) ) + { + return EthosU85NpuOp::Elementwise; + } + else if ( UseAvgPoolNop(type) ) + { + return EthosU85NpuOp::Pooling; + } + return EthosU85NpuOp::None; +} + +// TODO: this is activation fusing only +int EthosU85OpGroup::Add(const ArchitectureOpGroupQuery &op, const std::vector &dependsOn) +{ + LOG_TRACE1("Trying to add op {}\n", OpTypeToString(op.type)); + + if ( _opsCount >= 2 ) + { + // Can only fuse 2 ops + return 0; + } + + for ( int dep : dependsOn ) + { + if ( dep > 0 ) + { + // Don't validate user-specified (positive keys) dependencies + continue; + } + else if ( dep < 0 ) + { + // Convert to group generated keys (negative keys) to array index + dep = (-dep) - 1; + if ( dep >= _opsCount ) + { + // Missing dependency + return 0; + } + } + + const EthosU85OpGroup::OpInfo &prevOp = _ops[dep]; + if ( 
prevOp.ofm.key != op.ifm.key && prevOp.ofm.key != op.ifm2.key ) + { + // Can only fuse when ops are connected + return 0; + } + } + if ( !CanRunOnNPU(op) ) + { + // Can only fuse NPU ops + return 0; + } + + if ( _opsCount > 0 ) + { + if ( !IsActivation(op.type) ) + { + // Can only fuse with activation + return 0; + } + else if ( op.ifm.type == DataType::Int16 && (op.type == OpType::Sigmoid || op.type == OpType::Tanh) ) + { + // Can not fuse int16 Sigmoid and Tanh LUT since they require special scaling done by AvgPoolNop + return 0; + } + } + + // Generated key + int key = (-_opsCount) - 1; + + // Save copy of op + _ops[_opsCount].type = op.type; + _ops[_opsCount].ifm.key = op.ifm.key; + _ops[_opsCount].ifm.type = op.ifm.type; + _ops[_opsCount].ifm2.key = op.ifm2.key; + _ops[_opsCount].ifm2.type = op.ifm2.type; + _ops[_opsCount].ofm.key = op.ofm.key; + _ops[_opsCount].ofm.type = op.ofm.type; + _opsInternal[_opsCount].dependsOn = dependsOn; + _opsCount++; + + return key; +} + +// TODO: This table is from the EthosU55/U65 Embedded NPU Interface Specification, it's not completely valid for +// Ethos U85 since the allowed data types depend on ifm/ofm as well as selected acc and scaling. 
+static const std::unordered_map>> s_opDataTypeSupport = { + {EthosU85NpuOp::Convolution, + { + {DataType::UInt8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32, DataType::Int64}}, + {DataType::Int8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32, DataType::Int64}}, + {DataType::Int16, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32, DataType::Int64}}, + }}, + {EthosU85NpuOp::Depthwise, + { + {DataType::UInt8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32, DataType::Int64}}, + {DataType::Int8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32, DataType::Int64}}, + {DataType::Int16, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32, DataType::Int64}}, + }}, + {EthosU85NpuOp::VectorProduct, + { + {DataType::UInt8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32, DataType::Int64}}, + {DataType::Int8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32, DataType::Int64}}, + {DataType::Int16, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32, DataType::Int64}}, + }}, + {EthosU85NpuOp::Pooling, + { + {DataType::UInt8, {DataType::UInt8, DataType::Int32, DataType::Int64}}, + {DataType::Int8, {DataType::Int8, DataType::Int32, DataType::Int64}}, + {DataType::Int16, {DataType::Int16}}, + }}, + {EthosU85NpuOp::ReduceSum, + { + {DataType::UInt8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}}, + {DataType::Int8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}}, + {DataType::Int16, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}}, + {DataType::Int32, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}}, + }}, + {EthosU85NpuOp::Dma, + { + {DataType::UInt8, {DataType::UInt8}}, + {DataType::Int8, {DataType::Int8}}, + {DataType::Int16, {DataType::Int16}}, + {DataType::Int32, {DataType::Int32}}, + }}, + {EthosU85NpuOp::Resize, + { + 
{DataType::UInt8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32, DataType::Int64}}, + {DataType::Int8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32, DataType::Int64}}, + {DataType::Int16, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32, DataType::Int64}}, + }}, +}; + +bool EthosU85OpGroup::CanRunOnNPU(const ArchitectureOpGroupQuery &op) +{ + EthosU85NpuOp npuOp = ArchEthosU85::GetHWOp(op.type); + + if ( IsFloat(op.ifm.type | op.ifm2.type | op.ofm.type) ) + { + return false; + } + + if ( npuOp == EthosU85NpuOp::None ) + { + return false; + } + + auto k = op.kernel; + if ( k->Stride().x > 3 || k->Stride().y > 3 ) + { + return false; + } + + if ( k->Dilation().x > 2 || k->Dilation().y > 2 ) + { + return false; + } + + switch ( npuOp ) + { + case EthosU85NpuOp::Convolution: + case EthosU85NpuOp::Depthwise: + case EthosU85NpuOp::VectorProduct: + case EthosU85NpuOp::Pooling: + case EthosU85NpuOp::ReduceSum: + case EthosU85NpuOp::Elementwise: + case EthosU85NpuOp::Resize: + case EthosU85NpuOp::Dma: + break; + default: + assert(false && "Unrecognized HWOp"); + return false; + } + + // Check allowed ifm/ofm data type mapping + if ( npuOp != EthosU85NpuOp::Elementwise ) + { + if ( op.type == OpType::LUT || op.type == OpType::MemoryCopy ) + { // TODO: LUT operations end up here due to UseAvgPoolNop although the rules are not the same as + // for a Pooling operation, so skip checks for now. 
+ return true; + } + + auto map = s_opDataTypeSupport.find(npuOp); + if ( map == s_opDataTypeSupport.end() ) + { + assert(false && "Data type mapping for HWOp missing"); + return false; + } + auto &typeMap = map->second; + auto ifmEntry = typeMap.find(op.ifm.type); + if ( ifmEntry == typeMap.end() ) + { // Unsupported ifm data type + return false; + } + auto &ofmTypes = ifmEntry->second; + if ( 0 == std::count(ofmTypes.begin(), ofmTypes.end(), op.ofm.type) ) + { // Unsupported ofm data type + return false; + } + } + else + { + // TODO: Elementwise + } + + return true; +} + +} // namespace regor diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85.hpp b/ethosu/regor/architecture/ethosu85/ethos_u85.hpp new file mode 100644 index 00000000..831904ec --- /dev/null +++ b/ethosu/regor/architecture/ethosu85/ethos_u85.hpp @@ -0,0 +1,230 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#pragma once + +#include "architecture/architecture.hpp" +#include "architecture/ethos_u_scaling.hpp" +#include "architecture/register_command_stream_generator.hpp" +#include "architecture/weight_encoder.hpp" +#include "common/bit_flags.hpp" +#include "common/shape.hpp" +#include "ethos_u85_performance.hpp" + +#include + +namespace regor +{ + +enum class EthosU85Accumulator +{ + Acc32 = 0, + Acc48 = 1, + Acc_Last = Acc48 +}; + +enum class EthosU85Traversal +{ + DepthFirst = 0, + PartKernel = 1, + Depthwise = 2, +}; + +class ArchEthosU85; + +enum class EthosU85NpuOp +{ + None = 0, + Convolution, + Depthwise, + VectorProduct, + Pooling, + ReduceSum, + Elementwise, + Resize, + Dma, +}; + +/// +/// Per-operator architecture configuration +/// +class EthosU85OpConfig : public ArchitectureOpConfig +{ + friend class ArchEthosU85; + friend class EthosU85RCSGenerator; + +private: + Shape _ifmBlock; + Shape _ofmBlock; + Shape _ofmUBlock; + EthosU85Accumulator _accumulatorType = EthosU85Accumulator::Acc32; + ArchAccumulatorSource _accumulatorSource = ArchAccumulatorSource::Reset; + bool _accumulatorOutputEnabled = true; + EthosU85Traversal _traversal = EthosU85Traversal::DepthFirst; + int _ifmRamSizeBytes = 0; + +public: + EthosU85Traversal Traversal() const { return _traversal; } + const Shape &IfmBlock() const { return _ifmBlock; } + const Shape &OfmBlock() const { return _ofmBlock; } + const Shape &OfmUBlock() const { return _ofmUBlock; } + EthosU85Accumulator Acc() const { return _accumulatorType; } + ArchAccumulatorSource AccSource() const { return _accumulatorSource; } + bool AccOutputEnabled() const { return _accumulatorOutputEnabled; } + std::unique_ptr Clone() override; + int MaxIFMBuffering() override; + Point2i OptimalStripeGranule() override; + int OptimalDepthGranule() override; + std::string ToString(bool full) override; +}; + +/// +/// Group of ops that can be fused and/or chained +/// +class EthosU85OpGroup : public ArchitectureOpGroup +{ + friend class 
ArchEthosU85; + + using OpInfo = ArchitectureOpGroupQuery; + + struct InternalOpInfo + { + std::vector dependsOn; + }; + +private: + std::array _ops; + std::array _opsInternal; + int _opsCount = 0; + +public: + int Add(const ArchitectureOpGroupQuery &op, const std::vector &dependsOn = {}) override; + +protected: + bool CanRunOnNPU(const ArchitectureOpGroupQuery &op) override; +}; + +/// +/// EthosU85 specialisation +/// +class ArchEthosU85 : public Architecture +{ + friend class EthosU85WeightEncoder; + friend class EthosU85Performance; + friend class EthosU85RCSGenerator; + friend class EthosU85OpGroup; + +public: + struct AcceleratorConfig + { + int macs; + int cores; + std::array ofmUBlocks; + int nOfmUBlocks; + Shape ifmUBlock; + int ifmRamSizeBytes; + int accRamSizeBytes; + int elemUnits; + int numAxiSramLog2; + int numAxiExtLog2; + const EthosU85PerfInfo *perfInfo; + }; + +private: + static constexpr int LUT_SLOT_SIZE = 256; + std::unique_ptr _lutRam; + Shape _subkernelMax; + Shape _ofmBlockMax; + int _cores = 0; + int _macs = 0; + std::array _ofmUBlocks; + int _nOfmUBlocks = 1; + // maps ofm microblock and ifmbits to a bitmask of supported operations + std::array, 3> _uBlockToOpTable{}; + // maps ofm microblock to supported IFM allocation unit + std::array, 3> _uBlockToIfmAuTable{}; + Shape _ifmUBlock; + int _ifmRamSizeBytes = 0; + int _accRamSizeBytes = 0; + int _numAxiSramLog2 = 0; + int _numAxiExtLog2 = 0; + +protected: + std::unique_ptr _weightEncoder; + std::unique_ptr _performance; + std::unique_ptr _rcsGenerator; + +public: + ArchEthosU85(); + + + bool ParseConfig(IniReader *reader) override; + + std::unique_ptr GetOpConfig(OpType opType, const ArchitectureConfigQuery &query) override; + std::unique_ptr CreateOpGroup(const ArchitectureOpGroupQuery &op) override; + class WeightEncoder *WeightEncoder() override { return _weightEncoder.get(); } + IRegisterCommandStreamGenerator *RegisterCommandStreamGenerator() override { return _rcsGenerator.get(); } + 
ArchitecturePerformance *Performance() override { return _performance.get(); } + TensorFormat IdealBufferingFormat() override { return TensorFormat::NHCWB16; } + Address MaxAddress() override { return 1LL << 32; } + std::vector ConfigRegisters() override; + int UpscaleAndRounding(ArchResampling resampling, int &rounding) override; + AxisMask CanSubdivide(OpType opType) override; + bool SupportsLeakyRelu(bool quantized, DataType type) override; + bool SupportsMatMul(OpType opType) override; + bool SupportsTranspose(OpType opType, TransposeType transposeType) override; + bool SupportsReverse(OpType opType, ReverseType reverseType) override; + bool SupportsGather(OpType opType) override; + bool SupportsScatter(OpType opType) override; + bool SupportsSigmoidTanhLutInt16(OpType opType) override; + bool SupportsResize(const ResizeSupportQuery &query) override; + bool SupportsAccumulatorMode(ArchAccumulatorSource source, bool outputEnabled) override; + bool SupportsScalar(OpType opType, DataType dataType, TensorUsage usage) override; + bool SupportsArgMax(OpType opType) override; + Flags SupportedWeightFormat(OpType op) override; + uint32_t Version() override; + +protected: + void ApplyConfig(const AcceleratorConfig *cfg); + + std::unique_ptr FindBlockConfig(OpType opType, const ArchitectureConfigQuery &query); + + bool TryBlockConfig(EthosU85NpuOp npuOp, const Shape &ofmBlock, const Shape &ifmBlock, const Shape &ifmShape, + int ifmBits, int accBits, int ifmSpace, int accSpace, int ifmAuDepth, int numBlocksInRam, bool isEqualDepthOp); + + Shape GetStorageRounding(TensorFormat format); + + uint32_t ConfigRegister(int product); + // Checks if the operation is to be mapped on AvgPool + static bool UseAvgPoolNop(OpType type); + static EthosU85NpuOp GetHWOp(OpType type); + +private: + int MaxOutstandingKernelOps() { return 2; } + int MaxOutstandingDMAOps() { return 4; } + int MaxBlockdep() { return 7; } + bool IsUBlockValid(const OpType opType, int ifmBits, const Shape 
&ofmUBlock, bool hasIfm2); + Shape FindUBlock(OpType opType, const ArchitectureConfigQuery &query); + Shape CalcIfmAUSize(int IfmBlkDepth, int ifmBits, Shape ofmUBlk); + int CalcResizeMaxOfmBlockWidth(int ifmBits, int scaleN, int scaleD); + int IndexForOfmUBlock(const Shape &ofmUBlock); + void SetupOfmUBlockToOpTable(); + void SetupOfmUBlockToIfmAuTable(); +}; + +} // namespace regor diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_interface.hpp b/ethosu/regor/architecture/ethosu85/ethos_u85_interface.hpp new file mode 100644 index 00000000..fa7e7b70 --- /dev/null +++ b/ethosu/regor/architecture/ethosu85/ethos_u85_interface.hpp @@ -0,0 +1,24312 @@ +// +// SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#pragma once + +#ifdef __KERNEL__ +#include +#else +#include +#endif + +#if !defined(__cplusplus) || __cplusplus < 201402L +#define CONSTEXPR +#else +#define CONSTEXPR constexpr +#endif + +#ifndef __cplusplus +#define STRUCT struct +#else +#define STRUCT +#endif + +#if defined(__cplusplus) && defined(NPU_DISASSEMBLE) +#include +#include +#include +#endif + +#if defined(__cplusplus) && !defined(NPU_NAMESPACE) +#define NPU_NAMESPACE npu +#endif + +#ifdef __cplusplus +#include +#include +#include +#endif + +#ifdef __cplusplus +namespace NPU_NAMESPACE +{ +#endif +#define NNX_ARCH_VERSION_MAJOR 2 +#define NNX_ARCH_VERSION_MINOR 0 +#define NNX_ARCH_VERSION_PATCH 0 + + + + + +#define NPU_REG_ID 0x0000 +#define NPU_REG_STATUS 0x0004 +#define NPU_REG_CMD 0x0008 +#define NPU_REG_RESET 0x000C +#define NPU_REG_QBASE 0x0010 +#define NPU_REG_QBASE_HI 0x0014 +#define NPU_REG_QREAD 0x0018 +#define NPU_REG_QCONFIG 0x001C +#define NPU_REG_QSIZE 0x0020 +#define NPU_REG_PROT 0x0024 +#define NPU_REG_CONFIG 0x0028 +#define NPU_REG_COND_STATUS 0x0030 +#define NPU_REG_POWER_CTRL 0x0038 +#define NPU_REG_REGIONCFG 0x003C +#define NPU_REG_MEM_ATTR_BASE 0x0040 +#define NPU_REG_MEM_ATTR_ARRLEN 0x0004 +#define NPU_REG_AXI_SRAM 0x0050 +#define NPU_REG_AXI_EXT 0x0054 +#define NPU_REG_CFG_SRAM_CAP 0x0060 +#define NPU_REG_CFG_EXT_CAP 0x0064 +#define NPU_REG_CFG_SRAM_HASH0 0x0068 +#define NPU_REG_CFG_SRAM_HASH0_HI 0x006C +#define NPU_REG_CFG_SRAM_HASH1 0x0070 +#define NPU_REG_CFG_SRAM_HASH1_HI 0x0074 +#define NPU_REG_CFG_EXT_HASH0 0x0078 +#define NPU_REG_CFG_EXT_HASH0_HI 0x007C +#define BASE_REGISTERS_SIZE 0x0080 + + + + +#define NPU_REG_BASEP_BASE 0x0080 +#define NPU_REG_BASEP_ARRLEN 0x0008 +#define BASE_POINTERS_REGISTERS_SIZE 0x0100 + + + + +#define NPU_REG_CLKFORCE 0x0140 +#define NPU_REG_DEBUG_ADDRESS 0x0144 +#define NPU_REG_DEBUG_MISC 0x0148 +#define DEBUG_REGISTERS_SIZE 0x0180 + + + + +#define NPU_REG_DMA_IFM_SRC 0x0240 +#define NPU_REG_DMA_IFM_SRC_HI 0x0244 +#define 
NPU_REG_DMA_IFM_DST 0x0248 +#define NPU_REG_DMA_OFM_SRC 0x024C +#define NPU_REG_DMA_OFM_DST 0x0250 +#define NPU_REG_DMA_OFM_DST_HI 0x0254 +#define NPU_REG_DMA_WEIGHT_SRC 0x0258 +#define NPU_REG_DMA_WEIGHT_SRC_HI 0x025C +#define NPU_REG_DMA_CMD_SRC 0x0260 +#define NPU_REG_DMA_CMD_SRC_HI 0x0264 +#define NPU_REG_DMA_CMD_SIZE 0x0268 +#define NPU_REG_DMA_M2M_SRC 0x026C +#define NPU_REG_DMA_M2M_SRC_HI 0x0270 +#define NPU_REG_DMA_M2M_DST 0x0274 +#define NPU_REG_DMA_M2M_DST_HI 0x0278 +#define NPU_REG_CURRENT_QREAD 0x027C +#define NPU_REG_DMA_SCALE_SRC 0x0280 +#define NPU_REG_DMA_SCALE_SRC_HI 0x0284 +#define NPU_REG_DMA_WEIGHT1_SRC 0x0288 +#define NPU_REG_DMA_WEIGHT1_SRC_HI 0x028C +#define NPU_REG_DMA_WEIGHT2_SRC 0x0290 +#define NPU_REG_DMA_WEIGHT2_SRC_HI 0x0294 +#define NPU_REG_DMA_WEIGHT3_SRC 0x0298 +#define NPU_REG_DMA_WEIGHT3_SRC_HI 0x029C +#define NPU_REG_CURRENT_OP 0x02B8 +#define NPU_REG_CURRENT_CMD 0x02BC +#define TSU_DEBUG_REGISTERS_SIZE 0x0300 + + + + +#define NPU_REG_INTERNAL_MEMORY_BASE 0x0400 +#define NPU_REG_INTERNAL_MEMORY_ARRLEN 0x0100 +#define INTERNAL_MEMORY_REGISTERS_SIZE 0x0800 + + + + +#define NPU_REG_IFM_PAD_TOP 0x0800 +#define NPU_REG_IFM_PAD_LEFT 0x0804 +#define NPU_REG_IFM_PAD_RIGHT 0x0808 +#define NPU_REG_IFM_PAD_BOTTOM 0x080C +#define NPU_REG_IFM_DEPTH_M1 0x0810 +#define NPU_REG_IFM_PRECISION 0x0814 +#define NPU_REG_IFM_UPSCALE 0x081C +#define NPU_REG_IFM_BROADCAST 0x0820 +#define NPU_REG_IFM_ZERO_POINT 0x0824 +#define NPU_REG_IFM_WIDTH0_M1 0x0828 +#define NPU_REG_IFM_HEIGHT0_M1 0x082C +#define NPU_REG_IFM_HEIGHT1_M1 0x0830 +#define NPU_REG_IFM_REGION 0x083C +#define TSU_IFM_REGISTERS_SIZE 0x0840 + + + + +#define NPU_REG_OFM_WIDTH_M1 0x0844 +#define NPU_REG_OFM_HEIGHT_M1 0x0848 +#define NPU_REG_OFM_DEPTH_M1 0x084C +#define NPU_REG_OFM_PRECISION 0x0850 +#define NPU_REG_OFM_BLK_WIDTH_M1 0x0854 +#define NPU_REG_OFM_BLK_HEIGHT_M1 0x0858 +#define NPU_REG_OFM_BLK_DEPTH_M1 0x085C +#define NPU_REG_OFM_ZERO_POINT 0x0860 +#define NPU_REG_OFM_WIDTH0_M1 
0x0868 +#define NPU_REG_OFM_HEIGHT0_M1 0x086C +#define NPU_REG_OFM_HEIGHT1_M1 0x0870 +#define NPU_REG_OFM_REGION 0x087C +#define TSU_OFM_REGISTERS_SIZE 0x0880 + + + + +#define NPU_REG_KERNEL_WIDTH_M1 0x0880 +#define NPU_REG_KERNEL_HEIGHT_M1 0x0884 +#define NPU_REG_KERNEL_STRIDE 0x0888 +#define NPU_REG_ACC_FORMAT 0x0890 +#define NPU_REG_ACTIVATION 0x0894 +#define NPU_REG_ACTIVATION_MIN 0x0898 +#define NPU_REG_ACTIVATION_MAX 0x089C +#define NPU_REG_WEIGHT_REGION 0x08A0 +#define NPU_REG_SCALE_REGION 0x08A4 +#define NPU_REG_RESIZE_X_SCALE_N_M1 0x08A8 +#define NPU_REG_RESIZE_Y_SCALE_N_M1 0x08AC +#define NPU_REG_RESIZE_X_OFFSET 0x08B0 +#define NPU_REG_RESIZE_Y_OFFSET 0x08B4 +#define NPU_REG_WEIGHT_FORMAT 0x08B8 +#define NPU_REG_BLOCKDEP 0x08BC +#define TSU_KERNEL_REGISTERS_SIZE 0x08C0 + + + + +#define NPU_REG_DMA0_SRC_REGION 0x08C0 +#define NPU_REG_DMA0_DST_REGION 0x08C4 +#define NPU_REG_DMA0_SIZE0 0x08C8 +#define NPU_REG_DMA0_SIZE1 0x08CC +#define NPU_REG_DMA0_IDX_REGION 0x08D0 +#define TSU_DMA_REGISTERS_SIZE 0x0900 + + + + +#define NPU_REG_IFM2_BROADCAST 0x0900 +#define NPU_REG_IFM2_PRECISION 0x0914 +#define NPU_REG_IFM2_ZERO_POINT 0x0924 +#define NPU_REG_IFM2_WIDTH0_M1 0x0928 +#define NPU_REG_IFM2_HEIGHT0_M1 0x092C +#define NPU_REG_IFM2_HEIGHT1_M1 0x0930 +#define NPU_REG_IFM2_REGION 0x093C +#define TSU_IFM2_REGISTERS_SIZE 0x0940 + + + + +#define NPU_REG_IFM_BASE0 0x0A00 +#define NPU_REG_IFM_BASE0_HI 0x0A04 +#define NPU_REG_IFM_BASE1 0x0A08 +#define NPU_REG_IFM_BASE1_HI 0x0A0C +#define NPU_REG_IFM_BASE2 0x0A10 +#define NPU_REG_IFM_BASE2_HI 0x0A14 +#define NPU_REG_IFM_BASE3 0x0A18 +#define NPU_REG_IFM_BASE3_HI 0x0A1C +#define NPU_REG_IFM_STRIDE_X 0x0A20 +#define NPU_REG_IFM_STRIDE_X_HI 0x0A24 +#define NPU_REG_IFM_STRIDE_Y 0x0A28 +#define NPU_REG_IFM_STRIDE_Y_HI 0x0A2C +#define NPU_REG_IFM_STRIDE_C 0x0A30 +#define NPU_REG_IFM_STRIDE_C_HI 0x0A34 +#define TSU_IFM_BASE_REGISTERS_SIZE 0x0A40 + + + + +#define NPU_REG_OFM_BASE0 0x0A40 +#define NPU_REG_OFM_BASE0_HI 0x0A44 
+#define NPU_REG_OFM_BASE1 0x0A48 +#define NPU_REG_OFM_BASE1_HI 0x0A4C +#define NPU_REG_OFM_BASE2 0x0A50 +#define NPU_REG_OFM_BASE2_HI 0x0A54 +#define NPU_REG_OFM_BASE3 0x0A58 +#define NPU_REG_OFM_BASE3_HI 0x0A5C +#define NPU_REG_OFM_STRIDE_X 0x0A60 +#define NPU_REG_OFM_STRIDE_X_HI 0x0A64 +#define NPU_REG_OFM_STRIDE_Y 0x0A68 +#define NPU_REG_OFM_STRIDE_Y_HI 0x0A6C +#define NPU_REG_OFM_STRIDE_C 0x0A70 +#define NPU_REG_OFM_STRIDE_C_HI 0x0A74 +#define TSU_OFM_BASE_REGISTERS_SIZE 0x0A80 + + + + +#define NPU_REG_WEIGHT_BASE 0x0A80 +#define NPU_REG_WEIGHT_BASE_HI 0x0A84 +#define NPU_REG_WEIGHT_LENGTH 0x0A88 +#define NPU_REG_WEIGHT_LENGTH_HI 0x0A8C +#define NPU_REG_SCALE_BASE 0x0A90 +#define NPU_REG_SCALE_BASE_HI 0x0A94 +#define NPU_REG_SCALE_LENGTH 0x0A98 +#define NPU_REG_SCALE_LENGTH_HI 0x0A9C +#define NPU_REG_OFM_SCALE 0x0AA0 +#define NPU_REG_OFM_SCALE_HI 0x0AA4 +#define NPU_REG_IFM_SCALE 0x0AA8 +#define NPU_REG_IFM_SCALE_HI 0x0AAC +#define NPU_REG_IFM2_SCALE 0x0AB0 +#define NPU_REG_IFM2_SCALE_HI 0x0AB4 +#define NPU_REG_OP_SCALAR 0x0AB8 +#define NPU_REG_OP_SCALAR_HI 0x0ABC +#define TSU_WS_BASE_REGISTERS_SIZE 0x0AC0 + + + + +#define NPU_REG_DMA0_SRC 0x0AC0 +#define NPU_REG_DMA0_SRC_HI 0x0AC4 +#define NPU_REG_DMA0_DST 0x0AC8 +#define NPU_REG_DMA0_DST_HI 0x0ACC +#define NPU_REG_DMA0_LEN 0x0AD0 +#define NPU_REG_DMA0_LEN_HI 0x0AD4 +#define NPU_REG_DMA0_SRC_STRIDE0 0x0AD8 +#define NPU_REG_DMA0_SRC_STRIDE0_HI 0x0ADC +#define NPU_REG_DMA0_SRC_STRIDE1 0x0AE0 +#define NPU_REG_DMA0_SRC_STRIDE1_HI 0x0AE4 +#define NPU_REG_DMA0_DST_STRIDE0 0x0AE8 +#define NPU_REG_DMA0_DST_STRIDE0_HI 0x0AEC +#define NPU_REG_DMA0_DST_STRIDE1 0x0AF0 +#define NPU_REG_DMA0_DST_STRIDE1_HI 0x0AF4 +#define NPU_REG_DMA0_IDX 0x0AF8 +#define NPU_REG_DMA0_IDX_HI 0x0AFC +#define TSU_DMA_BASE_REGISTERS_SIZE 0x0B00 + + + + +#define NPU_REG_IFM2_BASE0 0x0B00 +#define NPU_REG_IFM2_BASE0_HI 0x0B04 +#define NPU_REG_IFM2_BASE1 0x0B08 +#define NPU_REG_IFM2_BASE1_HI 0x0B0C +#define NPU_REG_IFM2_BASE2 0x0B10 +#define 
NPU_REG_IFM2_BASE2_HI 0x0B14 +#define NPU_REG_IFM2_BASE3 0x0B18 +#define NPU_REG_IFM2_BASE3_HI 0x0B1C +#define NPU_REG_IFM2_STRIDE_X 0x0B20 +#define NPU_REG_IFM2_STRIDE_X_HI 0x0B24 +#define NPU_REG_IFM2_STRIDE_Y 0x0B28 +#define NPU_REG_IFM2_STRIDE_Y_HI 0x0B2C +#define NPU_REG_IFM2_STRIDE_C 0x0B30 +#define NPU_REG_IFM2_STRIDE_C_HI 0x0B34 +#define TSU_IFM2_BASE_REGISTERS_SIZE 0x0B40 + + + + +#define NPU_REG_WEIGHT1_BASE 0x0B40 +#define NPU_REG_WEIGHT1_BASE_HI 0x0B44 +#define NPU_REG_WEIGHT1_LENGTH 0x0B48 +#define NPU_REG_WEIGHT1_LENGTH_HI 0x0B4C +#define NPU_REG_WEIGHT2_BASE 0x0B50 +#define NPU_REG_WEIGHT2_BASE_HI 0x0B54 +#define NPU_REG_WEIGHT2_LENGTH 0x0B58 +#define NPU_REG_WEIGHT2_LENGTH_HI 0x0B5C +#define NPU_REG_WEIGHT3_BASE 0x0B60 +#define NPU_REG_WEIGHT3_BASE_HI 0x0B64 +#define NPU_REG_WEIGHT3_LENGTH 0x0B68 +#define NPU_REG_WEIGHT3_LENGTH_HI 0x0B6C +#define NPU_REG_RESIZE_X_STEP 0x0B70 +#define NPU_REG_RESIZE_X_STEP_HI 0x0B74 +#define NPU_REG_RESIZE_Y_STEP 0x0B78 +#define NPU_REG_RESIZE_Y_STEP_HI 0x0B7C +#define TSU_WS1_BASE_REGISTERS_SIZE 0x0B80 + + + + +#define TSU_USER_BASE_REGISTERS_SIZE 0x0BC0 + + + + +#define NPU_REG_DMA0_IDX_MAX 0x0BC0 +#define NPU_REG_DMA0_IDX_MAX_HI 0x0BC4 +#define NPU_REG_DMA0_IDX_SKIP1 0x0BC8 +#define NPU_REG_DMA0_IDX_SKIP1_HI 0x0BCC +#define TSU_DMA_EBASE_REGISTERS_SIZE 0x0C00 + + + + +#define NPU_REG_REVISION 0x0FC0 +#define NPU_REG_PID4 0x0FD0 +#define NPU_REG_PID5 0x0FD4 +#define NPU_REG_PID6 0x0FD8 +#define NPU_REG_PID7 0x0FDC +#define NPU_REG_PID0 0x0FE0 +#define NPU_REG_PID1 0x0FE4 +#define NPU_REG_PID2 0x0FE8 +#define NPU_REG_PID3 0x0FEC +#define NPU_REG_CID0 0x0FF0 +#define NPU_REG_CID1 0x0FF4 +#define NPU_REG_CID2 0x0FF8 +#define NPU_REG_CID3 0x0FFC +#define ID_REGISTERS_SIZE 0x1000 + + + + +#define NPU_REG_WD_STATUS 0x1100 +#define NPU_REG_MAC_STATUS 0x1104 +#define NPU_REG_AO_STATUS 0x1108 +#define NPU_REG_DMA_STATUS0 0x1110 +#define NPU_REG_DMA_STATUS1 0x1114 +#define DEBUG_STATUS_REGISTERS_SIZE 0x1180 + + + + +#define 
NPU_REG_PMCR 0x1180 +#define NPU_REG_PMCNTENSET 0x1184 +#define NPU_REG_PMCNTENCLR 0x1188 +#define NPU_REG_PMOVSSET 0x118C +#define NPU_REG_PMOVSCLR 0x1190 +#define NPU_REG_PMINTSET 0x1194 +#define NPU_REG_PMINTCLR 0x1198 +#define NPU_REG_PMCCNTR 0x11A0 +#define NPU_REG_PMCCNTR_HI 0x11A4 +#define NPU_REG_PMCCNTR_CFG 0x11A8 +#define NPU_REG_PMCAXI_CHAN 0x11AC +#define NPU_REG_PMCLUT 0x11B0 +#define PMU_REGISTERS_SIZE 0x1200 + + + + +#define NPU_REG_PMEVCNTR_BASE 0x1300 +#define NPU_REG_PMEVCNTR_ARRLEN 0x0008 +#define NPU_REG_PMEVTYPER_BASE 0x1380 +#define NPU_REG_PMEVTYPER_ARRLEN 0x0008 +#define PMU_COUNTERS_REGISTERS_SIZE 0x1400 + +#ifdef __cplusplus + +enum class acc_format : uint8_t +{ + I32 = 0, + I48 = 1, +}; + +enum class acc_input : uint8_t +{ + RESET = 0, + KEEP = 1, + IFM2 = 2, +}; + +enum class acc_output : uint8_t +{ + ENABLE = 0, + DISABLE = 1, +}; + +enum class activation_clip_range : uint8_t +{ + B16 = 0, + NONE = 1, +}; + +enum class activation_format : uint8_t +{ + NHWC = 0, + NHCWB16 = 1, +}; + +enum class activation_function : uint8_t +{ + LUT_NONE = 0, + LUT_U8_U8 = 1, + LUT_S8_S8 = 4, + LUT_S8_S16 = 5, + LUT_S8_S32 = 7, + LUT_S16_S16 = 8, + LUT_S16_S32 = 9, + LUT_TANH = 10, + LUT_SIGMOID = 11, +}; + +enum class activation_precision : uint8_t +{ + B8 = 0, + B16 = 1, + B32 = 2, + B64 = 3, +}; + +enum class activation_reverse : uint8_t +{ + NONE = 0, + H = 1, + W = 2, + C = 3, +}; + +enum class activation_storage : uint8_t +{ + TILE2X2 = 0, + TILE3X1 = 1, + CHAINED = 2, + NONE = 3, +}; + +enum class activation_transpose : uint8_t +{ + HWC = 0, + WHC = 1, + HCW = 2, + WCH = 3, + CHW = 6, + CWH = 7, +}; + +enum class activation_type : uint8_t +{ + UNSIGNED = 0, + SIGNED = 1, +}; + +enum class axi_mem_domain : uint8_t +{ + NON_SHARABLE = 0, + INNER_SHARABLE = 1, + OUTER_SHARABLE = 2, + SYSTEM = 3, +}; + +enum class axi_mem_encoding : uint8_t +{ + DEVICE_NON_BUFFERABLE = 0, + DEVICE_BUFFERABLE = 1, + NORMAL_NON_CACHEABLE_NON_BUFFERABLE = 2, + 
NORMAL_NON_CACHEABLE_BUFFERABLE = 3, + WRITE_THROUGH_NO_ALLOCATE = 4, + WRITE_THROUGH_READ_ALLOCATE = 5, + WRITE_THROUGH_WRITE_ALLOCATE = 6, + WRITE_THROUGH_READ_AND_WRITE_ALLOCATE = 7, + WRITE_BACK_NO_ALLOCATE = 8, + WRITE_BACK_READ_ALLOCATE = 9, + WRITE_BACK_WRITE_ALLOCATE = 10, + WRITE_BACK_READ_AND_WRITE_ALLOCATE = 11, +}; + +enum class axi_port : uint8_t +{ + SRAM = 0, + EXT = 1, +}; + +enum class branch_cond : uint8_t +{ + ALWAYS = 0, + RF_TRUE = 1, +}; + +enum class broadcast_mode : uint8_t +{ + NONE = 0, + H = 1, + W = 2, + HW = 3, + C = 4, + CH = 5, + CW = 6, + CWH = 7, + SCALAR = 8, +}; + +enum class cmd0_opcode : uint16_t +{ + NPU_OP_STOP = 0, + NPU_OP_IRQ = 1, + NPU_OP_CONV = 2, + NPU_OP_DEPTHWISE = 3, + NPU_OP_POOL = 5, + NPU_OP_ELEMENTWISE = 6, + NPU_OP_RESIZE = 7, + NPU_OP_DMA_START = 16, + NPU_OP_DMA_WAIT = 17, + NPU_OP_KERNEL_WAIT = 18, + NPU_OP_PMU_MASK = 19, + NPU_SET_IFM_PAD_TOP = 256, + NPU_SET_IFM_PAD_LEFT = 257, + NPU_SET_IFM_PAD_RIGHT = 258, + NPU_SET_IFM_PAD_BOTTOM = 259, + NPU_SET_IFM_DEPTH_M1 = 260, + NPU_SET_IFM_PRECISION = 261, + NPU_SET_IFM_UPSCALE = 263, + NPU_SET_IFM_BROADCAST = 264, + NPU_SET_IFM_ZERO_POINT = 265, + NPU_SET_IFM_WIDTH0_M1 = 266, + NPU_SET_IFM_HEIGHT0_M1 = 267, + NPU_SET_IFM_HEIGHT1_M1 = 268, + NPU_SET_IFM_REGION = 271, + NPU_SET_OFM_WIDTH_M1 = 273, + NPU_SET_OFM_HEIGHT_M1 = 274, + NPU_SET_OFM_DEPTH_M1 = 275, + NPU_SET_OFM_PRECISION = 276, + NPU_SET_OFM_BLK_WIDTH_M1 = 277, + NPU_SET_OFM_BLK_HEIGHT_M1 = 278, + NPU_SET_OFM_BLK_DEPTH_M1 = 279, + NPU_SET_OFM_ZERO_POINT = 280, + NPU_SET_OFM_WIDTH0_M1 = 282, + NPU_SET_OFM_HEIGHT0_M1 = 283, + NPU_SET_OFM_HEIGHT1_M1 = 284, + NPU_SET_OFM_REGION = 287, + NPU_SET_KERNEL_WIDTH_M1 = 288, + NPU_SET_KERNEL_HEIGHT_M1 = 289, + NPU_SET_KERNEL_STRIDE = 290, + NPU_SET_ACC_FORMAT = 292, + NPU_SET_ACTIVATION = 293, + NPU_SET_ACTIVATION_MIN = 294, + NPU_SET_ACTIVATION_MAX = 295, + NPU_SET_WEIGHT_REGION = 296, + NPU_SET_SCALE_REGION = 297, + NPU_SET_RESIZE_X_SCALE_N_M1 = 298, + 
NPU_SET_RESIZE_Y_SCALE_N_M1 = 299, + NPU_SET_RESIZE_X_OFFSET = 300, + NPU_SET_RESIZE_Y_OFFSET = 301, + NPU_SET_WEIGHT_FORMAT = 302, + NPU_SET_BLOCKDEP = 303, + NPU_SET_DMA0_SRC_REGION = 304, + NPU_SET_DMA0_DST_REGION = 305, + NPU_SET_DMA0_SIZE0 = 306, + NPU_SET_DMA0_SIZE1 = 307, + NPU_SET_DMA0_IDX_REGION = 308, + NPU_SET_IFM2_BROADCAST = 384, + NPU_SET_IFM2_PRECISION = 389, + NPU_SET_IFM2_ZERO_POINT = 393, + NPU_SET_IFM2_WIDTH0_M1 = 394, + NPU_SET_IFM2_HEIGHT0_M1 = 395, + NPU_SET_IFM2_HEIGHT1_M1 = 396, + NPU_SET_IFM2_REGION = 399, +}; + +enum class cmd1_opcode : uint16_t +{ + NPU_SET_IFM_BASE0 = 0, + NPU_SET_IFM_BASE1 = 1, + NPU_SET_IFM_BASE2 = 2, + NPU_SET_IFM_BASE3 = 3, + NPU_SET_IFM_STRIDE_X = 4, + NPU_SET_IFM_STRIDE_Y = 5, + NPU_SET_IFM_STRIDE_C = 6, + NPU_SET_OFM_BASE0 = 16, + NPU_SET_OFM_BASE1 = 17, + NPU_SET_OFM_BASE2 = 18, + NPU_SET_OFM_BASE3 = 19, + NPU_SET_OFM_STRIDE_X = 20, + NPU_SET_OFM_STRIDE_Y = 21, + NPU_SET_OFM_STRIDE_C = 22, + NPU_SET_WEIGHT_BASE = 32, + NPU_SET_WEIGHT_LENGTH = 33, + NPU_SET_SCALE_BASE = 34, + NPU_SET_SCALE_LENGTH = 35, + NPU_SET_OFM_SCALE = 36, + NPU_SET_IFM_SCALE = 37, + NPU_SET_IFM2_SCALE = 38, + NPU_SET_OP_SCALAR = 39, + NPU_SET_DMA0_SRC = 48, + NPU_SET_DMA0_DST = 49, + NPU_SET_DMA0_LEN = 50, + NPU_SET_DMA0_SRC_STRIDE0 = 51, + NPU_SET_DMA0_SRC_STRIDE1 = 52, + NPU_SET_DMA0_DST_STRIDE0 = 53, + NPU_SET_DMA0_DST_STRIDE1 = 54, + NPU_SET_DMA0_IDX = 55, + NPU_SET_DMA0_IDX_MAX = 56, + NPU_SET_DMA0_IDX_SKIP1 = 57, + NPU_SET_IFM2_BASE0 = 128, + NPU_SET_IFM2_BASE1 = 129, + NPU_SET_IFM2_BASE2 = 130, + NPU_SET_IFM2_BASE3 = 131, + NPU_SET_IFM2_STRIDE_X = 132, + NPU_SET_IFM2_STRIDE_Y = 133, + NPU_SET_IFM2_STRIDE_C = 134, + NPU_SET_WEIGHT1_BASE = 144, + NPU_SET_WEIGHT1_LENGTH = 145, + NPU_SET_WEIGHT2_BASE = 146, + NPU_SET_WEIGHT2_LENGTH = 147, + NPU_SET_WEIGHT3_BASE = 148, + NPU_SET_WEIGHT3_LENGTH = 149, + NPU_SET_RESIZE_X = 150, + NPU_SET_RESIZE_Y = 151, + NPU_OP_BRANCH = 256, +}; + +enum class cmd_ctrl : uint8_t +{ + CMD0_CTRL = 0, + 
CMD1_CTRL = 1, +}; + +enum class custom_dma : uint8_t +{ + NOT_IMPLEMENTED = 0, + IMPLEMENTED = 1, +}; + +enum class dma_fault_channel : uint8_t +{ + CMD_READ = 0, + IFM_READ = 1, + WEIGHT_READ = 2, + SBS_READ = 3, + MEM2MEM_READ = 4, + OFM_WRITE = 8, + MEM2MEM_WRITE = 9, +}; + +enum class dma_fault_src : uint8_t +{ + SRAM = 0, + EXT = 1, +}; + +enum class dma_idx_mode : uint8_t +{ + DISABLED = 0, + ENABLED = 1, +}; + +enum class dma_region_mode : uint8_t +{ + EXTERNAL = 0, + INTERNAL = 1, +}; + +enum class dma_stride_mode : uint8_t +{ + D1 = 0, + D2 = 1, + D3 = 2, +}; + +enum class elementwise_mode : uint8_t +{ + MUL = 0, + ADD = 1, + SUB = 2, + MIN = 3, + MAX = 4, + LRELU = 5, + ABS = 6, + CLZ = 7, + SHR = 8, + SHL = 9, + LSR = 10, + DIV = 11, + CMP_EQ = 16, + CMP_NE = 17, + CMP_GE = 18, + CMP_GT = 19, + AND = 33, + OR = 34, + XOR = 35, + NOT = 36, + AND_NOT = 42, +}; + +enum class ifm_upscale_mode : uint8_t +{ + NONE = 0, + NEAREST = 1, + ZEROS = 2, +}; + +enum class kernel_decomposition : uint8_t +{ + D8X8 = 0, + D4X4 = 1, +}; + +enum class kernel_dilation : uint8_t +{ + NONE = 0, + X2 = 1, +}; + +enum class max_beats : uint8_t +{ + B64 = 0, + B128 = 1, + B256 = 2, +}; + +enum class microblock : uint8_t +{ + U1X1 = 0, + U1X2 = 1, + U1X4 = 2, + U2X2 = 3, + U2X4 = 4, + U4X4 = 5, +}; + +enum class ofm_scale_mode : uint8_t +{ + PER_CHANNEL = 0, + GLOBAL = 1, +}; + +enum class pmu_axi_channel : uint8_t +{ + RD_CMD = 0, + RD_IFM = 1, + RD_WEIGHTS = 2, + RD_SCALE_BIAS = 3, + RD_MEM2MEM = 4, + RD_IFM_STREAM = 5, + RD_MEM2MEM_IDX = 6, + WR_OFM = 8, + WR_MEM2MEM = 9, +}; + +enum class pmu_event : uint16_t +{ + NO_EVENT = 0, + CYCLE = 17, + NPU_IDLE = 32, + CC_STALLED_ON_BLOCKDEP = 33, + CC_STALLED_ON_SHRAM_RECONFIG = 34, + NPU_ACTIVE = 35, + MAC_ACTIVE = 48, + MAC_DPU_ACTIVE = 51, + MAC_STALLED_BY_W_OR_ACC = 52, + MAC_STALLED_BY_W = 53, + MAC_STALLED_BY_ACC = 54, + MAC_STALLED_BY_IB = 55, + MAC_STALLED_BY_INT_W = 57, + MAC_STALLED_BY_INT_ACC = 58, + AO_ACTIVE = 64, + 
AO_STALLED_BY_BS_OR_OB = 67, + AO_STALLED_BY_BS = 68, + AO_STALLED_BY_OB = 69, + AO_STALLED_BY_AB_OR_CB = 70, + AO_STALLED_BY_AB = 71, + AO_STALLED_BY_CB = 72, + WD_ACTIVE = 80, + WD_STALLED = 81, + WD_STALLED_BY_WD_BUF = 83, + WD_STALLED_BY_WS_FC = 84, + WD_STALLED_BY_WS_TC = 85, + WD_TRANS_WBLK = 89, + WD_TRANS_WS_FC = 90, + WD_TRANS_WS_TC = 91, + WD_STALLED_BY_WS_SC0 = 96, + WD_STALLED_BY_WS_SC1 = 97, + WD_STALLED_BY_WS_SC2 = 98, + WD_STALLED_BY_WS_SC3 = 99, + WD_PARSE_ACTIVE_SC0 = 100, + WD_PARSE_ACTIVE_SC1 = 101, + WD_PARSE_ACTIVE_SC2 = 102, + WD_PARSE_ACTIVE_SC3 = 103, + WD_PARSE_STALL_SC0 = 104, + WD_PARSE_STALL_SC1 = 105, + WD_PARSE_STALL_SC2 = 106, + WD_PARSE_STALL_SC3 = 107, + WD_PARSE_STALL_IN_SC0 = 108, + WD_PARSE_STALL_IN_SC1 = 109, + WD_PARSE_STALL_IN_SC2 = 110, + WD_PARSE_STALL_IN_SC3 = 111, + WD_PARSE_STALL_OUT_SC0 = 112, + WD_PARSE_STALL_OUT_SC1 = 113, + WD_PARSE_STALL_OUT_SC2 = 114, + WD_PARSE_STALL_OUT_SC3 = 115, + WD_TRANS_WS_SC0 = 116, + WD_TRANS_WS_SC1 = 117, + WD_TRANS_WS_SC2 = 118, + WD_TRANS_WS_SC3 = 119, + WD_TRANS_WB0 = 120, + WD_TRANS_WB1 = 121, + WD_TRANS_WB2 = 122, + WD_TRANS_WB3 = 123, + SRAM_RD_TRANS_ACCEPTED = 128, + SRAM_RD_TRANS_COMPLETED = 129, + SRAM_RD_DATA_BEAT_RECEIVED = 130, + SRAM_RD_TRAN_REQ_STALLED = 131, + SRAM_WR_TRANS_ACCEPTED = 132, + SRAM_WR_TRANS_COMPLETED_M = 133, + SRAM_WR_TRANS_COMPLETED_S = 134, + SRAM_WR_DATA_BEAT_WRITTEN = 135, + SRAM_WR_TRAN_REQ_STALLED = 136, + SRAM_WR_DATA_BEAT_STALLED = 137, + SRAM_ENABLED_CYCLES = 140, + SRAM_RD_STALL_LIMIT = 142, + SRAM_WR_STALL_LIMIT = 143, + AXI_LATENCY_ANY = 160, + AXI_LATENCY_32 = 161, + AXI_LATENCY_64 = 162, + AXI_LATENCY_128 = 163, + AXI_LATENCY_256 = 164, + AXI_LATENCY_512 = 165, + AXI_LATENCY_1024 = 166, + ECC_DMA = 176, + ECC_MAC_IB = 177, + ECC_MAC_AB = 178, + ECC_AO_CB = 179, + ECC_AO_OB = 180, + ECC_AO_LUT = 181, + EXT_RD_TRANS_ACCEPTED = 384, + EXT_RD_TRANS_COMPLETED = 385, + EXT_RD_DATA_BEAT_RECEIVED = 386, + EXT_RD_TRAN_REQ_STALLED = 387, + 
EXT_WR_TRANS_ACCEPTED = 388, + EXT_WR_TRANS_COMPLETED_M = 389, + EXT_WR_TRANS_COMPLETED_S = 390, + EXT_WR_DATA_BEAT_WRITTEN = 391, + EXT_WR_TRAN_REQ_STALLED = 392, + EXT_WR_DATA_BEAT_STALLED = 393, + EXT_ENABLED_CYCLES = 396, + EXT_RD_STALL_LIMIT = 398, + EXT_WR_STALL_LIMIT = 399, +}; + +enum class pmu_port_disable : uint8_t +{ + ENABLE = 0, + DISABLE = 1, +}; + +enum class pooling_mode : uint8_t +{ + MAX = 0, + AVERAGE = 1, + REDUCE_SUM = 2, + SUM = 3, + NONE = 4, + MIN = 5, + ARGMAX_X = 6, + ARGMAX_Y = 7, +}; + +enum class privilege_level : uint8_t +{ + USER = 0, + PRIVILEGED = 1, +}; + +enum class ram_id : uint8_t +{ + LUT = 0, + IB = 1, + AB = 2, + CB = 3, + OB = 4, +}; + +enum class resize_mode : uint8_t +{ + BILINEAR = 0, + REPLICATE = 1, + NEAREST = 2, +}; + +enum class round_mode_ifm : uint8_t +{ + DOUBLE_SYMMETRIC = 0, + NATURAL = 1, +}; + +enum class round_mode_ofm : uint8_t +{ + DOUBLE_SYMMETRIC = 0, + NATURAL = 1, + DOUBLE_ASYMMETRIC = 2, + SYMMETRIC = 3, + TRUNCATE_TO_ZERO = 4, + TRUNCATE_TO_LOWER = 5, +}; + +enum class security_level : uint8_t +{ + SECURE = 0, + NON_SECURE = 1, +}; + +enum class state : uint8_t +{ + STOPPED = 0, + RUNNING = 1, +}; + +enum class wd_active_core : uint8_t +{ + NONE = 0, + STANDARD = 1, + FAST = 2, + TENSOR = 3, +}; + +enum class weight_format : uint8_t +{ + SWD = 0, + FWD = 1, +}; + +enum class weight_order : uint8_t +{ + DEPTH_FIRST = 0, + PART_KERNEL_FIRST = 1, +}; + +enum class weight_sparsity : uint8_t +{ + NONE = 0, + SPARSE_2_4 = 1, +}; + +#else + +enum acc_format +{ + ACC_FORMAT_I32 = 0, + ACC_FORMAT_I48 = 1, +}; + +enum acc_input +{ + ACC_INPUT_RESET = 0, + ACC_INPUT_KEEP = 1, + ACC_INPUT_IFM2 = 2, +}; + +enum acc_output +{ + ACC_OUTPUT_ENABLE = 0, + ACC_OUTPUT_DISABLE = 1, +}; + +enum activation_clip_range +{ + ACTIVATION_CLIP_RANGE_B16 = 0, + ACTIVATION_CLIP_RANGE_NONE = 1, +}; + +enum activation_format +{ + ACTIVATION_FORMAT_NHWC = 0, + ACTIVATION_FORMAT_NHCWB16 = 1, +}; + +enum activation_function +{ + 
ACTIVATION_FUNCTION_LUT_NONE = 0, + ACTIVATION_FUNCTION_LUT_U8_U8 = 1, + ACTIVATION_FUNCTION_LUT_S8_S8 = 4, + ACTIVATION_FUNCTION_LUT_S8_S16 = 5, + ACTIVATION_FUNCTION_LUT_S8_S32 = 7, + ACTIVATION_FUNCTION_LUT_S16_S16 = 8, + ACTIVATION_FUNCTION_LUT_S16_S32 = 9, + ACTIVATION_FUNCTION_LUT_TANH = 10, + ACTIVATION_FUNCTION_LUT_SIGMOID = 11, +}; + +enum activation_precision +{ + ACTIVATION_PRECISION_B8 = 0, + ACTIVATION_PRECISION_B16 = 1, + ACTIVATION_PRECISION_B32 = 2, + ACTIVATION_PRECISION_B64 = 3, +}; + +enum activation_reverse +{ + ACTIVATION_REVERSE_NONE = 0, + ACTIVATION_REVERSE_H = 1, + ACTIVATION_REVERSE_W = 2, + ACTIVATION_REVERSE_C = 3, +}; + +enum activation_storage +{ + ACTIVATION_STORAGE_TILE2X2 = 0, + ACTIVATION_STORAGE_TILE3X1 = 1, + ACTIVATION_STORAGE_CHAINED = 2, + ACTIVATION_STORAGE_NONE = 3, +}; + +enum activation_transpose +{ + ACTIVATION_TRANSPOSE_HWC = 0, + ACTIVATION_TRANSPOSE_WHC = 1, + ACTIVATION_TRANSPOSE_HCW = 2, + ACTIVATION_TRANSPOSE_WCH = 3, + ACTIVATION_TRANSPOSE_CHW = 6, + ACTIVATION_TRANSPOSE_CWH = 7, +}; + +enum activation_type +{ + ACTIVATION_TYPE_UNSIGNED = 0, + ACTIVATION_TYPE_SIGNED = 1, +}; + +enum axi_mem_domain +{ + AXI_MEM_DOMAIN_NON_SHARABLE = 0, + AXI_MEM_DOMAIN_INNER_SHARABLE = 1, + AXI_MEM_DOMAIN_OUTER_SHARABLE = 2, + AXI_MEM_DOMAIN_SYSTEM = 3, +}; + +enum axi_mem_encoding +{ + AXI_MEM_ENCODING_DEVICE_NON_BUFFERABLE = 0, + AXI_MEM_ENCODING_DEVICE_BUFFERABLE = 1, + AXI_MEM_ENCODING_NORMAL_NON_CACHEABLE_NON_BUFFERABLE = 2, + AXI_MEM_ENCODING_NORMAL_NON_CACHEABLE_BUFFERABLE = 3, + AXI_MEM_ENCODING_WRITE_THROUGH_NO_ALLOCATE = 4, + AXI_MEM_ENCODING_WRITE_THROUGH_READ_ALLOCATE = 5, + AXI_MEM_ENCODING_WRITE_THROUGH_WRITE_ALLOCATE = 6, + AXI_MEM_ENCODING_WRITE_THROUGH_READ_AND_WRITE_ALLOCATE = 7, + AXI_MEM_ENCODING_WRITE_BACK_NO_ALLOCATE = 8, + AXI_MEM_ENCODING_WRITE_BACK_READ_ALLOCATE = 9, + AXI_MEM_ENCODING_WRITE_BACK_WRITE_ALLOCATE = 10, + AXI_MEM_ENCODING_WRITE_BACK_READ_AND_WRITE_ALLOCATE = 11, +}; + +enum axi_port +{ + 
AXI_PORT_SRAM = 0, + AXI_PORT_EXT = 1, +}; + +enum branch_cond +{ + BRANCH_COND_ALWAYS = 0, + BRANCH_COND_RF_TRUE = 1, +}; + +enum broadcast_mode +{ + BROADCAST_MODE_NONE = 0, + BROADCAST_MODE_H = 1, + BROADCAST_MODE_W = 2, + BROADCAST_MODE_HW = 3, + BROADCAST_MODE_C = 4, + BROADCAST_MODE_CH = 5, + BROADCAST_MODE_CW = 6, + BROADCAST_MODE_CWH = 7, + BROADCAST_MODE_SCALAR = 8, +}; + +enum cmd0_opcode +{ + CMD0_OPCODE_NPU_OP_STOP = 0, + CMD0_OPCODE_NPU_OP_IRQ = 1, + CMD0_OPCODE_NPU_OP_CONV = 2, + CMD0_OPCODE_NPU_OP_DEPTHWISE = 3, + CMD0_OPCODE_NPU_OP_POOL = 5, + CMD0_OPCODE_NPU_OP_ELEMENTWISE = 6, + CMD0_OPCODE_NPU_OP_RESIZE = 7, + CMD0_OPCODE_NPU_OP_DMA_START = 16, + CMD0_OPCODE_NPU_OP_DMA_WAIT = 17, + CMD0_OPCODE_NPU_OP_KERNEL_WAIT = 18, + CMD0_OPCODE_NPU_OP_PMU_MASK = 19, + CMD0_OPCODE_NPU_SET_IFM_PAD_TOP = 256, + CMD0_OPCODE_NPU_SET_IFM_PAD_LEFT = 257, + CMD0_OPCODE_NPU_SET_IFM_PAD_RIGHT = 258, + CMD0_OPCODE_NPU_SET_IFM_PAD_BOTTOM = 259, + CMD0_OPCODE_NPU_SET_IFM_DEPTH_M1 = 260, + CMD0_OPCODE_NPU_SET_IFM_PRECISION = 261, + CMD0_OPCODE_NPU_SET_IFM_UPSCALE = 263, + CMD0_OPCODE_NPU_SET_IFM_BROADCAST = 264, + CMD0_OPCODE_NPU_SET_IFM_ZERO_POINT = 265, + CMD0_OPCODE_NPU_SET_IFM_WIDTH0_M1 = 266, + CMD0_OPCODE_NPU_SET_IFM_HEIGHT0_M1 = 267, + CMD0_OPCODE_NPU_SET_IFM_HEIGHT1_M1 = 268, + CMD0_OPCODE_NPU_SET_IFM_REGION = 271, + CMD0_OPCODE_NPU_SET_OFM_WIDTH_M1 = 273, + CMD0_OPCODE_NPU_SET_OFM_HEIGHT_M1 = 274, + CMD0_OPCODE_NPU_SET_OFM_DEPTH_M1 = 275, + CMD0_OPCODE_NPU_SET_OFM_PRECISION = 276, + CMD0_OPCODE_NPU_SET_OFM_BLK_WIDTH_M1 = 277, + CMD0_OPCODE_NPU_SET_OFM_BLK_HEIGHT_M1 = 278, + CMD0_OPCODE_NPU_SET_OFM_BLK_DEPTH_M1 = 279, + CMD0_OPCODE_NPU_SET_OFM_ZERO_POINT = 280, + CMD0_OPCODE_NPU_SET_OFM_WIDTH0_M1 = 282, + CMD0_OPCODE_NPU_SET_OFM_HEIGHT0_M1 = 283, + CMD0_OPCODE_NPU_SET_OFM_HEIGHT1_M1 = 284, + CMD0_OPCODE_NPU_SET_OFM_REGION = 287, + CMD0_OPCODE_NPU_SET_KERNEL_WIDTH_M1 = 288, + CMD0_OPCODE_NPU_SET_KERNEL_HEIGHT_M1 = 289, + CMD0_OPCODE_NPU_SET_KERNEL_STRIDE = 290, + 
CMD0_OPCODE_NPU_SET_ACC_FORMAT = 292, + CMD0_OPCODE_NPU_SET_ACTIVATION = 293, + CMD0_OPCODE_NPU_SET_ACTIVATION_MIN = 294, + CMD0_OPCODE_NPU_SET_ACTIVATION_MAX = 295, + CMD0_OPCODE_NPU_SET_WEIGHT_REGION = 296, + CMD0_OPCODE_NPU_SET_SCALE_REGION = 297, + CMD0_OPCODE_NPU_SET_RESIZE_X_SCALE_N_M1 = 298, + CMD0_OPCODE_NPU_SET_RESIZE_Y_SCALE_N_M1 = 299, + CMD0_OPCODE_NPU_SET_RESIZE_X_OFFSET = 300, + CMD0_OPCODE_NPU_SET_RESIZE_Y_OFFSET = 301, + CMD0_OPCODE_NPU_SET_WEIGHT_FORMAT = 302, + CMD0_OPCODE_NPU_SET_BLOCKDEP = 303, + CMD0_OPCODE_NPU_SET_DMA0_SRC_REGION = 304, + CMD0_OPCODE_NPU_SET_DMA0_DST_REGION = 305, + CMD0_OPCODE_NPU_SET_DMA0_SIZE0 = 306, + CMD0_OPCODE_NPU_SET_DMA0_SIZE1 = 307, + CMD0_OPCODE_NPU_SET_DMA0_IDX_REGION = 308, + CMD0_OPCODE_NPU_SET_IFM2_BROADCAST = 384, + CMD0_OPCODE_NPU_SET_IFM2_PRECISION = 389, + CMD0_OPCODE_NPU_SET_IFM2_ZERO_POINT = 393, + CMD0_OPCODE_NPU_SET_IFM2_WIDTH0_M1 = 394, + CMD0_OPCODE_NPU_SET_IFM2_HEIGHT0_M1 = 395, + CMD0_OPCODE_NPU_SET_IFM2_HEIGHT1_M1 = 396, + CMD0_OPCODE_NPU_SET_IFM2_REGION = 399, +}; + +enum cmd1_opcode +{ + CMD1_OPCODE_NPU_SET_IFM_BASE0 = 0, + CMD1_OPCODE_NPU_SET_IFM_BASE1 = 1, + CMD1_OPCODE_NPU_SET_IFM_BASE2 = 2, + CMD1_OPCODE_NPU_SET_IFM_BASE3 = 3, + CMD1_OPCODE_NPU_SET_IFM_STRIDE_X = 4, + CMD1_OPCODE_NPU_SET_IFM_STRIDE_Y = 5, + CMD1_OPCODE_NPU_SET_IFM_STRIDE_C = 6, + CMD1_OPCODE_NPU_SET_OFM_BASE0 = 16, + CMD1_OPCODE_NPU_SET_OFM_BASE1 = 17, + CMD1_OPCODE_NPU_SET_OFM_BASE2 = 18, + CMD1_OPCODE_NPU_SET_OFM_BASE3 = 19, + CMD1_OPCODE_NPU_SET_OFM_STRIDE_X = 20, + CMD1_OPCODE_NPU_SET_OFM_STRIDE_Y = 21, + CMD1_OPCODE_NPU_SET_OFM_STRIDE_C = 22, + CMD1_OPCODE_NPU_SET_WEIGHT_BASE = 32, + CMD1_OPCODE_NPU_SET_WEIGHT_LENGTH = 33, + CMD1_OPCODE_NPU_SET_SCALE_BASE = 34, + CMD1_OPCODE_NPU_SET_SCALE_LENGTH = 35, + CMD1_OPCODE_NPU_SET_OFM_SCALE = 36, + CMD1_OPCODE_NPU_SET_IFM_SCALE = 37, + CMD1_OPCODE_NPU_SET_IFM2_SCALE = 38, + CMD1_OPCODE_NPU_SET_OP_SCALAR = 39, + CMD1_OPCODE_NPU_SET_DMA0_SRC = 48, + CMD1_OPCODE_NPU_SET_DMA0_DST = 
49, + CMD1_OPCODE_NPU_SET_DMA0_LEN = 50, + CMD1_OPCODE_NPU_SET_DMA0_SRC_STRIDE0 = 51, + CMD1_OPCODE_NPU_SET_DMA0_SRC_STRIDE1 = 52, + CMD1_OPCODE_NPU_SET_DMA0_DST_STRIDE0 = 53, + CMD1_OPCODE_NPU_SET_DMA0_DST_STRIDE1 = 54, + CMD1_OPCODE_NPU_SET_DMA0_IDX = 55, + CMD1_OPCODE_NPU_SET_DMA0_IDX_MAX = 56, + CMD1_OPCODE_NPU_SET_DMA0_IDX_SKIP1 = 57, + CMD1_OPCODE_NPU_SET_IFM2_BASE0 = 128, + CMD1_OPCODE_NPU_SET_IFM2_BASE1 = 129, + CMD1_OPCODE_NPU_SET_IFM2_BASE2 = 130, + CMD1_OPCODE_NPU_SET_IFM2_BASE3 = 131, + CMD1_OPCODE_NPU_SET_IFM2_STRIDE_X = 132, + CMD1_OPCODE_NPU_SET_IFM2_STRIDE_Y = 133, + CMD1_OPCODE_NPU_SET_IFM2_STRIDE_C = 134, + CMD1_OPCODE_NPU_SET_WEIGHT1_BASE = 144, + CMD1_OPCODE_NPU_SET_WEIGHT1_LENGTH = 145, + CMD1_OPCODE_NPU_SET_WEIGHT2_BASE = 146, + CMD1_OPCODE_NPU_SET_WEIGHT2_LENGTH = 147, + CMD1_OPCODE_NPU_SET_WEIGHT3_BASE = 148, + CMD1_OPCODE_NPU_SET_WEIGHT3_LENGTH = 149, + CMD1_OPCODE_NPU_SET_RESIZE_X = 150, + CMD1_OPCODE_NPU_SET_RESIZE_Y = 151, + CMD1_OPCODE_NPU_OP_BRANCH = 256, +}; + +enum cmd_ctrl +{ + CMD_CTRL_CMD0_CTRL = 0, + CMD_CTRL_CMD1_CTRL = 1, +}; + +enum custom_dma +{ + CUSTOM_DMA_NOT_IMPLEMENTED = 0, + CUSTOM_DMA_IMPLEMENTED = 1, +}; + +enum dma_fault_channel +{ + DMA_FAULT_CHANNEL_CMD_READ = 0, + DMA_FAULT_CHANNEL_IFM_READ = 1, + DMA_FAULT_CHANNEL_WEIGHT_READ = 2, + DMA_FAULT_CHANNEL_SBS_READ = 3, + DMA_FAULT_CHANNEL_MEM2MEM_READ = 4, + DMA_FAULT_CHANNEL_OFM_WRITE = 8, + DMA_FAULT_CHANNEL_MEM2MEM_WRITE = 9, +}; + +enum dma_fault_src +{ + DMA_FAULT_SRC_SRAM = 0, + DMA_FAULT_SRC_EXT = 1, +}; + +enum dma_idx_mode +{ + DMA_IDX_MODE_DISABLED = 0, + DMA_IDX_MODE_ENABLED = 1, +}; + +enum dma_region_mode +{ + DMA_REGION_MODE_EXTERNAL = 0, + DMA_REGION_MODE_INTERNAL = 1, +}; + +enum dma_stride_mode +{ + DMA_STRIDE_MODE_D1 = 0, + DMA_STRIDE_MODE_D2 = 1, + DMA_STRIDE_MODE_D3 = 2, +}; + +enum elementwise_mode +{ + ELEMENTWISE_MODE_MUL = 0, + ELEMENTWISE_MODE_ADD = 1, + ELEMENTWISE_MODE_SUB = 2, + ELEMENTWISE_MODE_MIN = 3, + ELEMENTWISE_MODE_MAX = 4, + 
ELEMENTWISE_MODE_LRELU = 5, + ELEMENTWISE_MODE_ABS = 6, + ELEMENTWISE_MODE_CLZ = 7, + ELEMENTWISE_MODE_SHR = 8, + ELEMENTWISE_MODE_SHL = 9, + ELEMENTWISE_MODE_LSR = 10, + ELEMENTWISE_MODE_DIV = 11, + ELEMENTWISE_MODE_CMP_EQ = 16, + ELEMENTWISE_MODE_CMP_NE = 17, + ELEMENTWISE_MODE_CMP_GE = 18, + ELEMENTWISE_MODE_CMP_GT = 19, + ELEMENTWISE_MODE_AND = 33, + ELEMENTWISE_MODE_OR = 34, + ELEMENTWISE_MODE_XOR = 35, + ELEMENTWISE_MODE_NOT = 36, + ELEMENTWISE_MODE_AND_NOT = 42, +}; + +enum ifm_upscale_mode +{ + IFM_UPSCALE_MODE_NONE = 0, + IFM_UPSCALE_MODE_NEAREST = 1, + IFM_UPSCALE_MODE_ZEROS = 2, +}; + +enum kernel_decomposition +{ + KERNEL_DECOMPOSITION_D8X8 = 0, + KERNEL_DECOMPOSITION_D4X4 = 1, +}; + +enum kernel_dilation +{ + KERNEL_DILATION_NONE = 0, + KERNEL_DILATION_X2 = 1, +}; + +enum max_beats +{ + MAX_BEATS_B64 = 0, + MAX_BEATS_B128 = 1, + MAX_BEATS_B256 = 2, +}; + +enum microblock +{ + MICROBLOCK_U1X1 = 0, + MICROBLOCK_U1X2 = 1, + MICROBLOCK_U1X4 = 2, + MICROBLOCK_U2X2 = 3, + MICROBLOCK_U2X4 = 4, + MICROBLOCK_U4X4 = 5, +}; + +enum ofm_scale_mode +{ + OFM_SCALE_MODE_PER_CHANNEL = 0, + OFM_SCALE_MODE_GLOBAL = 1, +}; + +enum pmu_axi_channel +{ + PMU_AXI_CHANNEL_RD_CMD = 0, + PMU_AXI_CHANNEL_RD_IFM = 1, + PMU_AXI_CHANNEL_RD_WEIGHTS = 2, + PMU_AXI_CHANNEL_RD_SCALE_BIAS = 3, + PMU_AXI_CHANNEL_RD_MEM2MEM = 4, + PMU_AXI_CHANNEL_RD_IFM_STREAM = 5, + PMU_AXI_CHANNEL_RD_MEM2MEM_IDX = 6, + PMU_AXI_CHANNEL_WR_OFM = 8, + PMU_AXI_CHANNEL_WR_MEM2MEM = 9, +}; + +enum pmu_event +{ + PMU_EVENT_NO_EVENT = 0, + PMU_EVENT_CYCLE = 17, + PMU_EVENT_NPU_IDLE = 32, + PMU_EVENT_CC_STALLED_ON_BLOCKDEP = 33, + PMU_EVENT_CC_STALLED_ON_SHRAM_RECONFIG = 34, + PMU_EVENT_NPU_ACTIVE = 35, + PMU_EVENT_MAC_ACTIVE = 48, + PMU_EVENT_MAC_DPU_ACTIVE = 51, + PMU_EVENT_MAC_STALLED_BY_W_OR_ACC = 52, + PMU_EVENT_MAC_STALLED_BY_W = 53, + PMU_EVENT_MAC_STALLED_BY_ACC = 54, + PMU_EVENT_MAC_STALLED_BY_IB = 55, + PMU_EVENT_MAC_STALLED_BY_INT_W = 57, + PMU_EVENT_MAC_STALLED_BY_INT_ACC = 58, + PMU_EVENT_AO_ACTIVE 
= 64, + PMU_EVENT_AO_STALLED_BY_BS_OR_OB = 67, + PMU_EVENT_AO_STALLED_BY_BS = 68, + PMU_EVENT_AO_STALLED_BY_OB = 69, + PMU_EVENT_AO_STALLED_BY_AB_OR_CB = 70, + PMU_EVENT_AO_STALLED_BY_AB = 71, + PMU_EVENT_AO_STALLED_BY_CB = 72, + PMU_EVENT_WD_ACTIVE = 80, + PMU_EVENT_WD_STALLED = 81, + PMU_EVENT_WD_STALLED_BY_WD_BUF = 83, + PMU_EVENT_WD_STALLED_BY_WS_FC = 84, + PMU_EVENT_WD_STALLED_BY_WS_TC = 85, + PMU_EVENT_WD_TRANS_WBLK = 89, + PMU_EVENT_WD_TRANS_WS_FC = 90, + PMU_EVENT_WD_TRANS_WS_TC = 91, + PMU_EVENT_WD_STALLED_BY_WS_SC0 = 96, + PMU_EVENT_WD_STALLED_BY_WS_SC1 = 97, + PMU_EVENT_WD_STALLED_BY_WS_SC2 = 98, + PMU_EVENT_WD_STALLED_BY_WS_SC3 = 99, + PMU_EVENT_WD_PARSE_ACTIVE_SC0 = 100, + PMU_EVENT_WD_PARSE_ACTIVE_SC1 = 101, + PMU_EVENT_WD_PARSE_ACTIVE_SC2 = 102, + PMU_EVENT_WD_PARSE_ACTIVE_SC3 = 103, + PMU_EVENT_WD_PARSE_STALL_SC0 = 104, + PMU_EVENT_WD_PARSE_STALL_SC1 = 105, + PMU_EVENT_WD_PARSE_STALL_SC2 = 106, + PMU_EVENT_WD_PARSE_STALL_SC3 = 107, + PMU_EVENT_WD_PARSE_STALL_IN_SC0 = 108, + PMU_EVENT_WD_PARSE_STALL_IN_SC1 = 109, + PMU_EVENT_WD_PARSE_STALL_IN_SC2 = 110, + PMU_EVENT_WD_PARSE_STALL_IN_SC3 = 111, + PMU_EVENT_WD_PARSE_STALL_OUT_SC0 = 112, + PMU_EVENT_WD_PARSE_STALL_OUT_SC1 = 113, + PMU_EVENT_WD_PARSE_STALL_OUT_SC2 = 114, + PMU_EVENT_WD_PARSE_STALL_OUT_SC3 = 115, + PMU_EVENT_WD_TRANS_WS_SC0 = 116, + PMU_EVENT_WD_TRANS_WS_SC1 = 117, + PMU_EVENT_WD_TRANS_WS_SC2 = 118, + PMU_EVENT_WD_TRANS_WS_SC3 = 119, + PMU_EVENT_WD_TRANS_WB0 = 120, + PMU_EVENT_WD_TRANS_WB1 = 121, + PMU_EVENT_WD_TRANS_WB2 = 122, + PMU_EVENT_WD_TRANS_WB3 = 123, + PMU_EVENT_SRAM_RD_TRANS_ACCEPTED = 128, + PMU_EVENT_SRAM_RD_TRANS_COMPLETED = 129, + PMU_EVENT_SRAM_RD_DATA_BEAT_RECEIVED = 130, + PMU_EVENT_SRAM_RD_TRAN_REQ_STALLED = 131, + PMU_EVENT_SRAM_WR_TRANS_ACCEPTED = 132, + PMU_EVENT_SRAM_WR_TRANS_COMPLETED_M = 133, + PMU_EVENT_SRAM_WR_TRANS_COMPLETED_S = 134, + PMU_EVENT_SRAM_WR_DATA_BEAT_WRITTEN = 135, + PMU_EVENT_SRAM_WR_TRAN_REQ_STALLED = 136, + PMU_EVENT_SRAM_WR_DATA_BEAT_STALLED = 
137, + PMU_EVENT_SRAM_ENABLED_CYCLES = 140, + PMU_EVENT_SRAM_RD_STALL_LIMIT = 142, + PMU_EVENT_SRAM_WR_STALL_LIMIT = 143, + PMU_EVENT_AXI_LATENCY_ANY = 160, + PMU_EVENT_AXI_LATENCY_32 = 161, + PMU_EVENT_AXI_LATENCY_64 = 162, + PMU_EVENT_AXI_LATENCY_128 = 163, + PMU_EVENT_AXI_LATENCY_256 = 164, + PMU_EVENT_AXI_LATENCY_512 = 165, + PMU_EVENT_AXI_LATENCY_1024 = 166, + PMU_EVENT_ECC_DMA = 176, + PMU_EVENT_ECC_MAC_IB = 177, + PMU_EVENT_ECC_MAC_AB = 178, + PMU_EVENT_ECC_AO_CB = 179, + PMU_EVENT_ECC_AO_OB = 180, + PMU_EVENT_ECC_AO_LUT = 181, + PMU_EVENT_EXT_RD_TRANS_ACCEPTED = 384, + PMU_EVENT_EXT_RD_TRANS_COMPLETED = 385, + PMU_EVENT_EXT_RD_DATA_BEAT_RECEIVED = 386, + PMU_EVENT_EXT_RD_TRAN_REQ_STALLED = 387, + PMU_EVENT_EXT_WR_TRANS_ACCEPTED = 388, + PMU_EVENT_EXT_WR_TRANS_COMPLETED_M = 389, + PMU_EVENT_EXT_WR_TRANS_COMPLETED_S = 390, + PMU_EVENT_EXT_WR_DATA_BEAT_WRITTEN = 391, + PMU_EVENT_EXT_WR_TRAN_REQ_STALLED = 392, + PMU_EVENT_EXT_WR_DATA_BEAT_STALLED = 393, + PMU_EVENT_EXT_ENABLED_CYCLES = 396, + PMU_EVENT_EXT_RD_STALL_LIMIT = 398, + PMU_EVENT_EXT_WR_STALL_LIMIT = 399, +}; + +enum pmu_port_disable +{ + PMU_PORT_DISABLE_ENABLE = 0, + PMU_PORT_DISABLE_DISABLE = 1, +}; + +enum pooling_mode +{ + POOLING_MODE_MAX = 0, + POOLING_MODE_AVERAGE = 1, + POOLING_MODE_REDUCE_SUM = 2, + POOLING_MODE_SUM = 3, + POOLING_MODE_NONE = 4, + POOLING_MODE_MIN = 5, + POOLING_MODE_ARGMAX_X = 6, + POOLING_MODE_ARGMAX_Y = 7, +}; + +enum privilege_level +{ + PRIVILEGE_LEVEL_USER = 0, + PRIVILEGE_LEVEL_PRIVILEGED = 1, +}; + +enum ram_id +{ + RAM_ID_LUT = 0, + RAM_ID_IB = 1, + RAM_ID_AB = 2, + RAM_ID_CB = 3, + RAM_ID_OB = 4, +}; + +enum resize_mode +{ + RESIZE_MODE_BILINEAR = 0, + RESIZE_MODE_REPLICATE = 1, + RESIZE_MODE_NEAREST = 2, +}; + +enum round_mode_ifm +{ + ROUND_MODE_IFM_DOUBLE_SYMMETRIC = 0, + ROUND_MODE_IFM_NATURAL = 1, +}; + +enum round_mode_ofm +{ + ROUND_MODE_OFM_DOUBLE_SYMMETRIC = 0, + ROUND_MODE_OFM_NATURAL = 1, + ROUND_MODE_OFM_DOUBLE_ASYMMETRIC = 2, + ROUND_MODE_OFM_SYMMETRIC 
= 3, + ROUND_MODE_OFM_TRUNCATE_TO_ZERO = 4, + ROUND_MODE_OFM_TRUNCATE_TO_LOWER = 5, +}; + +enum security_level +{ + SECURITY_LEVEL_SECURE = 0, + SECURITY_LEVEL_NON_SECURE = 1, +}; + +enum state +{ + STATE_STOPPED = 0, + STATE_RUNNING = 1, +}; + +enum wd_active_core +{ + WD_ACTIVE_CORE_NONE = 0, + WD_ACTIVE_CORE_STANDARD = 1, + WD_ACTIVE_CORE_FAST = 2, + WD_ACTIVE_CORE_TENSOR = 3, +}; + +enum weight_format +{ + WEIGHT_FORMAT_SWD = 0, + WEIGHT_FORMAT_FWD = 1, +}; + +enum weight_order +{ + WEIGHT_ORDER_DEPTH_FIRST = 0, + WEIGHT_ORDER_PART_KERNEL_FIRST = 1, +}; + +enum weight_sparsity +{ + WEIGHT_SPARSITY_NONE = 0, + WEIGHT_SPARSITY_SPARSE_2_4 = 1, +}; + +#endif + +#ifdef NPU_DISASSEMBLE + +static const char* acc_format_str[] = +{ + "ACC_FORMAT_I32", + "ACC_FORMAT_I48", +}; + +static const char* acc_input_str[] = +{ + "ACC_INPUT_RESET", + "ACC_INPUT_KEEP", + "ACC_INPUT_IFM2", +}; + +static const char* acc_output_str[] = +{ + "ACC_OUTPUT_ENABLE", + "ACC_OUTPUT_DISABLE", +}; + +static const char* activation_clip_range_str[] = +{ + "ACTIVATION_CLIP_RANGE_B16", + "ACTIVATION_CLIP_RANGE_NONE", +}; + +static const char* activation_format_str[] = +{ + "ACTIVATION_FORMAT_NHWC", + "ACTIVATION_FORMAT_NHCWB16", +}; + +static const char* activation_function_str[] = +{ + "ACTIVATION_FUNCTION_LUT_NONE", + "ACTIVATION_FUNCTION_LUT_U8_U8", + "****", + "****", + "ACTIVATION_FUNCTION_LUT_S8_S8", + "ACTIVATION_FUNCTION_LUT_S8_S16", + "****", + "ACTIVATION_FUNCTION_LUT_S8_S32", + "ACTIVATION_FUNCTION_LUT_S16_S16", + "ACTIVATION_FUNCTION_LUT_S16_S32", + "ACTIVATION_FUNCTION_LUT_TANH", + "ACTIVATION_FUNCTION_LUT_SIGMOID", +}; + +static const char* activation_precision_str[] = +{ + "ACTIVATION_PRECISION_B8", + "ACTIVATION_PRECISION_B16", + "ACTIVATION_PRECISION_B32", + "ACTIVATION_PRECISION_B64", +}; + +static const char* activation_reverse_str[] = +{ + "ACTIVATION_REVERSE_NONE", + "ACTIVATION_REVERSE_H", + "ACTIVATION_REVERSE_W", + "ACTIVATION_REVERSE_C", +}; + +static const char* 
activation_storage_str[] = +{ + "ACTIVATION_STORAGE_TILE2X2", + "ACTIVATION_STORAGE_TILE3X1", + "ACTIVATION_STORAGE_CHAINED", + "ACTIVATION_STORAGE_NONE", +}; + +static const char* activation_transpose_str[] = +{ + "ACTIVATION_TRANSPOSE_HWC", + "ACTIVATION_TRANSPOSE_WHC", + "ACTIVATION_TRANSPOSE_HCW", + "ACTIVATION_TRANSPOSE_WCH", + "****", + "****", + "ACTIVATION_TRANSPOSE_CHW", + "ACTIVATION_TRANSPOSE_CWH", +}; + +static const char* activation_type_str[] = +{ + "ACTIVATION_TYPE_UNSIGNED", + "ACTIVATION_TYPE_SIGNED", +}; + +static const char* axi_mem_domain_str[] = +{ + "AXI_MEM_DOMAIN_NON_SHARABLE", + "AXI_MEM_DOMAIN_INNER_SHARABLE", + "AXI_MEM_DOMAIN_OUTER_SHARABLE", + "AXI_MEM_DOMAIN_SYSTEM", +}; + +static const char* axi_mem_encoding_str[] = +{ + "AXI_MEM_ENCODING_DEVICE_NON_BUFFERABLE", + "AXI_MEM_ENCODING_DEVICE_BUFFERABLE", + "AXI_MEM_ENCODING_NORMAL_NON_CACHEABLE_NON_BUFFERABLE", + "AXI_MEM_ENCODING_NORMAL_NON_CACHEABLE_BUFFERABLE", + "AXI_MEM_ENCODING_WRITE_THROUGH_NO_ALLOCATE", + "AXI_MEM_ENCODING_WRITE_THROUGH_READ_ALLOCATE", + "AXI_MEM_ENCODING_WRITE_THROUGH_WRITE_ALLOCATE", + "AXI_MEM_ENCODING_WRITE_THROUGH_READ_AND_WRITE_ALLOCATE", + "AXI_MEM_ENCODING_WRITE_BACK_NO_ALLOCATE", + "AXI_MEM_ENCODING_WRITE_BACK_READ_ALLOCATE", + "AXI_MEM_ENCODING_WRITE_BACK_WRITE_ALLOCATE", + "AXI_MEM_ENCODING_WRITE_BACK_READ_AND_WRITE_ALLOCATE", +}; + +static const char* axi_port_str[] = +{ + "AXI_PORT_SRAM", + "AXI_PORT_EXT", +}; + +static const char* branch_cond_str[] = +{ + "BRANCH_COND_ALWAYS", + "BRANCH_COND_RF_TRUE", +}; + +static const char* broadcast_mode_str[] = +{ + "BROADCAST_MODE_NONE", + "BROADCAST_MODE_H", + "BROADCAST_MODE_W", + "BROADCAST_MODE_HW", + "BROADCAST_MODE_C", + "BROADCAST_MODE_CH", + "BROADCAST_MODE_CW", + "BROADCAST_MODE_CWH", + "BROADCAST_MODE_SCALAR", +}; + +static const char* cmd0_opcode_str[] = +{ + "CMD0_OPCODE_NPU_OP_STOP", + "CMD0_OPCODE_NPU_OP_IRQ", + "CMD0_OPCODE_NPU_OP_CONV", + "CMD0_OPCODE_NPU_OP_DEPTHWISE", + "****", + 
"CMD0_OPCODE_NPU_OP_POOL", + "CMD0_OPCODE_NPU_OP_ELEMENTWISE", + "CMD0_OPCODE_NPU_OP_RESIZE", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "CMD0_OPCODE_NPU_OP_DMA_START", + "CMD0_OPCODE_NPU_OP_DMA_WAIT", + "CMD0_OPCODE_NPU_OP_KERNEL_WAIT", + "CMD0_OPCODE_NPU_OP_PMU_MASK", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", 
+ "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "CMD0_OPCODE_NPU_SET_IFM_PAD_TOP", + "CMD0_OPCODE_NPU_SET_IFM_PAD_LEFT", + "CMD0_OPCODE_NPU_SET_IFM_PAD_RIGHT", + "CMD0_OPCODE_NPU_SET_IFM_PAD_BOTTOM", + "CMD0_OPCODE_NPU_SET_IFM_DEPTH_M1", + "CMD0_OPCODE_NPU_SET_IFM_PRECISION", + "****", + "CMD0_OPCODE_NPU_SET_IFM_UPSCALE", + "CMD0_OPCODE_NPU_SET_IFM_BROADCAST", + "CMD0_OPCODE_NPU_SET_IFM_ZERO_POINT", + "CMD0_OPCODE_NPU_SET_IFM_WIDTH0_M1", + "CMD0_OPCODE_NPU_SET_IFM_HEIGHT0_M1", + "CMD0_OPCODE_NPU_SET_IFM_HEIGHT1_M1", + "****", + "****", + "CMD0_OPCODE_NPU_SET_IFM_REGION", + "****", + "CMD0_OPCODE_NPU_SET_OFM_WIDTH_M1", + "CMD0_OPCODE_NPU_SET_OFM_HEIGHT_M1", + "CMD0_OPCODE_NPU_SET_OFM_DEPTH_M1", + "CMD0_OPCODE_NPU_SET_OFM_PRECISION", + "CMD0_OPCODE_NPU_SET_OFM_BLK_WIDTH_M1", + "CMD0_OPCODE_NPU_SET_OFM_BLK_HEIGHT_M1", + "CMD0_OPCODE_NPU_SET_OFM_BLK_DEPTH_M1", + "CMD0_OPCODE_NPU_SET_OFM_ZERO_POINT", + "****", + "CMD0_OPCODE_NPU_SET_OFM_WIDTH0_M1", + "CMD0_OPCODE_NPU_SET_OFM_HEIGHT0_M1", + "CMD0_OPCODE_NPU_SET_OFM_HEIGHT1_M1", + "****", + "****", + "CMD0_OPCODE_NPU_SET_OFM_REGION", + "CMD0_OPCODE_NPU_SET_KERNEL_WIDTH_M1", + "CMD0_OPCODE_NPU_SET_KERNEL_HEIGHT_M1", + "CMD0_OPCODE_NPU_SET_KERNEL_STRIDE", + "****", + "CMD0_OPCODE_NPU_SET_ACC_FORMAT", + "CMD0_OPCODE_NPU_SET_ACTIVATION", + "CMD0_OPCODE_NPU_SET_ACTIVATION_MIN", + 
"CMD0_OPCODE_NPU_SET_ACTIVATION_MAX", + "CMD0_OPCODE_NPU_SET_WEIGHT_REGION", + "CMD0_OPCODE_NPU_SET_SCALE_REGION", + "CMD0_OPCODE_NPU_SET_RESIZE_X_SCALE_N_M1", + "CMD0_OPCODE_NPU_SET_RESIZE_Y_SCALE_N_M1", + "CMD0_OPCODE_NPU_SET_RESIZE_X_OFFSET", + "CMD0_OPCODE_NPU_SET_RESIZE_Y_OFFSET", + "CMD0_OPCODE_NPU_SET_WEIGHT_FORMAT", + "CMD0_OPCODE_NPU_SET_BLOCKDEP", + "CMD0_OPCODE_NPU_SET_DMA0_SRC_REGION", + "CMD0_OPCODE_NPU_SET_DMA0_DST_REGION", + "CMD0_OPCODE_NPU_SET_DMA0_SIZE0", + "CMD0_OPCODE_NPU_SET_DMA0_SIZE1", + "CMD0_OPCODE_NPU_SET_DMA0_IDX_REGION", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "CMD0_OPCODE_NPU_SET_IFM2_BROADCAST", + "****", + "****", + "****", + "****", + "CMD0_OPCODE_NPU_SET_IFM2_PRECISION", + "****", + "****", + "****", + "CMD0_OPCODE_NPU_SET_IFM2_ZERO_POINT", + "CMD0_OPCODE_NPU_SET_IFM2_WIDTH0_M1", + "CMD0_OPCODE_NPU_SET_IFM2_HEIGHT0_M1", + "CMD0_OPCODE_NPU_SET_IFM2_HEIGHT1_M1", + "****", + "****", + "CMD0_OPCODE_NPU_SET_IFM2_REGION", +}; + +static const char* cmd1_opcode_str[] = +{ + "CMD1_OPCODE_NPU_SET_IFM_BASE0", + "CMD1_OPCODE_NPU_SET_IFM_BASE1", + "CMD1_OPCODE_NPU_SET_IFM_BASE2", + "CMD1_OPCODE_NPU_SET_IFM_BASE3", + "CMD1_OPCODE_NPU_SET_IFM_STRIDE_X", + "CMD1_OPCODE_NPU_SET_IFM_STRIDE_Y", + "CMD1_OPCODE_NPU_SET_IFM_STRIDE_C", + "****", + "****", + 
"****", + "****", + "****", + "****", + "****", + "****", + "****", + "CMD1_OPCODE_NPU_SET_OFM_BASE0", + "CMD1_OPCODE_NPU_SET_OFM_BASE1", + "CMD1_OPCODE_NPU_SET_OFM_BASE2", + "CMD1_OPCODE_NPU_SET_OFM_BASE3", + "CMD1_OPCODE_NPU_SET_OFM_STRIDE_X", + "CMD1_OPCODE_NPU_SET_OFM_STRIDE_Y", + "CMD1_OPCODE_NPU_SET_OFM_STRIDE_C", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "CMD1_OPCODE_NPU_SET_WEIGHT_BASE", + "CMD1_OPCODE_NPU_SET_WEIGHT_LENGTH", + "CMD1_OPCODE_NPU_SET_SCALE_BASE", + "CMD1_OPCODE_NPU_SET_SCALE_LENGTH", + "CMD1_OPCODE_NPU_SET_OFM_SCALE", + "CMD1_OPCODE_NPU_SET_IFM_SCALE", + "CMD1_OPCODE_NPU_SET_IFM2_SCALE", + "CMD1_OPCODE_NPU_SET_OP_SCALAR", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "CMD1_OPCODE_NPU_SET_DMA0_SRC", + "CMD1_OPCODE_NPU_SET_DMA0_DST", + "CMD1_OPCODE_NPU_SET_DMA0_LEN", + "CMD1_OPCODE_NPU_SET_DMA0_SRC_STRIDE0", + "CMD1_OPCODE_NPU_SET_DMA0_SRC_STRIDE1", + "CMD1_OPCODE_NPU_SET_DMA0_DST_STRIDE0", + "CMD1_OPCODE_NPU_SET_DMA0_DST_STRIDE1", + "CMD1_OPCODE_NPU_SET_DMA0_IDX", + "CMD1_OPCODE_NPU_SET_DMA0_IDX_MAX", + "CMD1_OPCODE_NPU_SET_DMA0_IDX_SKIP1", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "CMD1_OPCODE_NPU_SET_IFM2_BASE0", + "CMD1_OPCODE_NPU_SET_IFM2_BASE1", + "CMD1_OPCODE_NPU_SET_IFM2_BASE2", + 
"CMD1_OPCODE_NPU_SET_IFM2_BASE3", + "CMD1_OPCODE_NPU_SET_IFM2_STRIDE_X", + "CMD1_OPCODE_NPU_SET_IFM2_STRIDE_Y", + "CMD1_OPCODE_NPU_SET_IFM2_STRIDE_C", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "CMD1_OPCODE_NPU_SET_WEIGHT1_BASE", + "CMD1_OPCODE_NPU_SET_WEIGHT1_LENGTH", + "CMD1_OPCODE_NPU_SET_WEIGHT2_BASE", + "CMD1_OPCODE_NPU_SET_WEIGHT2_LENGTH", + "CMD1_OPCODE_NPU_SET_WEIGHT3_BASE", + "CMD1_OPCODE_NPU_SET_WEIGHT3_LENGTH", + "CMD1_OPCODE_NPU_SET_RESIZE_X", + "CMD1_OPCODE_NPU_SET_RESIZE_Y", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "CMD1_OPCODE_NPU_OP_BRANCH", +}; + +static const char* cmd_ctrl_str[] = +{ + "CMD_CTRL_CMD0_CTRL", + "CMD_CTRL_CMD1_CTRL", +}; + +static const char* custom_dma_str[] = +{ + "CUSTOM_DMA_NOT_IMPLEMENTED", + "CUSTOM_DMA_IMPLEMENTED", +}; + +static const char* dma_fault_channel_str[] = +{ + "DMA_FAULT_CHANNEL_CMD_READ", + "DMA_FAULT_CHANNEL_IFM_READ", + "DMA_FAULT_CHANNEL_WEIGHT_READ", + 
"DMA_FAULT_CHANNEL_SBS_READ", + "DMA_FAULT_CHANNEL_MEM2MEM_READ", + "****", + "****", + "****", + "DMA_FAULT_CHANNEL_OFM_WRITE", + "DMA_FAULT_CHANNEL_MEM2MEM_WRITE", +}; + +static const char* dma_fault_src_str[] = +{ + "DMA_FAULT_SRC_SRAM", + "DMA_FAULT_SRC_EXT", +}; + +static const char* dma_idx_mode_str[] = +{ + "DMA_IDX_MODE_DISABLED", + "DMA_IDX_MODE_ENABLED", +}; + +static const char* dma_region_mode_str[] = +{ + "DMA_REGION_MODE_EXTERNAL", + "DMA_REGION_MODE_INTERNAL", +}; + +static const char* dma_stride_mode_str[] = +{ + "DMA_STRIDE_MODE_D1", + "DMA_STRIDE_MODE_D2", + "DMA_STRIDE_MODE_D3", +}; + +static const char* elementwise_mode_str[] = +{ + "ELEMENTWISE_MODE_MUL", + "ELEMENTWISE_MODE_ADD", + "ELEMENTWISE_MODE_SUB", + "ELEMENTWISE_MODE_MIN", + "ELEMENTWISE_MODE_MAX", + "ELEMENTWISE_MODE_LRELU", + "ELEMENTWISE_MODE_ABS", + "ELEMENTWISE_MODE_CLZ", + "ELEMENTWISE_MODE_SHR", + "ELEMENTWISE_MODE_SHL", + "ELEMENTWISE_MODE_LSR", + "ELEMENTWISE_MODE_DIV", + "****", + "****", + "****", + "****", + "ELEMENTWISE_MODE_CMP_EQ", + "ELEMENTWISE_MODE_CMP_NE", + "ELEMENTWISE_MODE_CMP_GE", + "ELEMENTWISE_MODE_CMP_GT", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "ELEMENTWISE_MODE_AND", + "ELEMENTWISE_MODE_OR", + "ELEMENTWISE_MODE_XOR", + "ELEMENTWISE_MODE_NOT", + "****", + "****", + "****", + "****", + "****", + "ELEMENTWISE_MODE_AND_NOT", +}; + +static const char* ifm_upscale_mode_str[] = +{ + "IFM_UPSCALE_MODE_NONE", + "IFM_UPSCALE_MODE_NEAREST", + "IFM_UPSCALE_MODE_ZEROS", +}; + +static const char* kernel_decomposition_str[] = +{ + "KERNEL_DECOMPOSITION_D8X8", + "KERNEL_DECOMPOSITION_D4X4", +}; + +static const char* kernel_dilation_str[] = +{ + "KERNEL_DILATION_NONE", + "KERNEL_DILATION_X2", +}; + +static const char* max_beats_str[] = +{ + "MAX_BEATS_B64", + "MAX_BEATS_B128", + "MAX_BEATS_B256", +}; + +static const char* microblock_str[] = +{ + "MICROBLOCK_U1X1", + "MICROBLOCK_U1X2", 
+ "MICROBLOCK_U1X4", + "MICROBLOCK_U2X2", + "MICROBLOCK_U2X4", + "MICROBLOCK_U4X4", +}; + +static const char* ofm_scale_mode_str[] = +{ + "OFM_SCALE_MODE_PER_CHANNEL", + "OFM_SCALE_MODE_GLOBAL", +}; + +static const char* pmu_axi_channel_str[] = +{ + "PMU_AXI_CHANNEL_RD_CMD", + "PMU_AXI_CHANNEL_RD_IFM", + "PMU_AXI_CHANNEL_RD_WEIGHTS", + "PMU_AXI_CHANNEL_RD_SCALE_BIAS", + "PMU_AXI_CHANNEL_RD_MEM2MEM", + "PMU_AXI_CHANNEL_RD_IFM_STREAM", + "PMU_AXI_CHANNEL_RD_MEM2MEM_IDX", + "****", + "PMU_AXI_CHANNEL_WR_OFM", + "PMU_AXI_CHANNEL_WR_MEM2MEM", +}; + +static const char* pmu_event_str[] = +{ + "PMU_EVENT_NO_EVENT", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "PMU_EVENT_CYCLE", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "PMU_EVENT_NPU_IDLE", + "PMU_EVENT_CC_STALLED_ON_BLOCKDEP", + "PMU_EVENT_CC_STALLED_ON_SHRAM_RECONFIG", + "PMU_EVENT_NPU_ACTIVE", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "PMU_EVENT_MAC_ACTIVE", + "****", + "****", + "PMU_EVENT_MAC_DPU_ACTIVE", + "PMU_EVENT_MAC_STALLED_BY_W_OR_ACC", + "PMU_EVENT_MAC_STALLED_BY_W", + "PMU_EVENT_MAC_STALLED_BY_ACC", + "PMU_EVENT_MAC_STALLED_BY_IB", + "****", + "PMU_EVENT_MAC_STALLED_BY_INT_W", + "PMU_EVENT_MAC_STALLED_BY_INT_ACC", + "****", + "****", + "****", + "****", + "****", + "PMU_EVENT_AO_ACTIVE", + "****", + "****", + "PMU_EVENT_AO_STALLED_BY_BS_OR_OB", + "PMU_EVENT_AO_STALLED_BY_BS", + "PMU_EVENT_AO_STALLED_BY_OB", + "PMU_EVENT_AO_STALLED_BY_AB_OR_CB", + "PMU_EVENT_AO_STALLED_BY_AB", + "PMU_EVENT_AO_STALLED_BY_CB", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "PMU_EVENT_WD_ACTIVE", + "PMU_EVENT_WD_STALLED", + "****", + "PMU_EVENT_WD_STALLED_BY_WD_BUF", + "PMU_EVENT_WD_STALLED_BY_WS_FC", + 
"PMU_EVENT_WD_STALLED_BY_WS_TC", + "****", + "****", + "****", + "PMU_EVENT_WD_TRANS_WBLK", + "PMU_EVENT_WD_TRANS_WS_FC", + "PMU_EVENT_WD_TRANS_WS_TC", + "****", + "****", + "****", + "****", + "PMU_EVENT_WD_STALLED_BY_WS_SC0", + "PMU_EVENT_WD_STALLED_BY_WS_SC1", + "PMU_EVENT_WD_STALLED_BY_WS_SC2", + "PMU_EVENT_WD_STALLED_BY_WS_SC3", + "PMU_EVENT_WD_PARSE_ACTIVE_SC0", + "PMU_EVENT_WD_PARSE_ACTIVE_SC1", + "PMU_EVENT_WD_PARSE_ACTIVE_SC2", + "PMU_EVENT_WD_PARSE_ACTIVE_SC3", + "PMU_EVENT_WD_PARSE_STALL_SC0", + "PMU_EVENT_WD_PARSE_STALL_SC1", + "PMU_EVENT_WD_PARSE_STALL_SC2", + "PMU_EVENT_WD_PARSE_STALL_SC3", + "PMU_EVENT_WD_PARSE_STALL_IN_SC0", + "PMU_EVENT_WD_PARSE_STALL_IN_SC1", + "PMU_EVENT_WD_PARSE_STALL_IN_SC2", + "PMU_EVENT_WD_PARSE_STALL_IN_SC3", + "PMU_EVENT_WD_PARSE_STALL_OUT_SC0", + "PMU_EVENT_WD_PARSE_STALL_OUT_SC1", + "PMU_EVENT_WD_PARSE_STALL_OUT_SC2", + "PMU_EVENT_WD_PARSE_STALL_OUT_SC3", + "PMU_EVENT_WD_TRANS_WS_SC0", + "PMU_EVENT_WD_TRANS_WS_SC1", + "PMU_EVENT_WD_TRANS_WS_SC2", + "PMU_EVENT_WD_TRANS_WS_SC3", + "PMU_EVENT_WD_TRANS_WB0", + "PMU_EVENT_WD_TRANS_WB1", + "PMU_EVENT_WD_TRANS_WB2", + "PMU_EVENT_WD_TRANS_WB3", + "****", + "****", + "****", + "****", + "PMU_EVENT_SRAM_RD_TRANS_ACCEPTED", + "PMU_EVENT_SRAM_RD_TRANS_COMPLETED", + "PMU_EVENT_SRAM_RD_DATA_BEAT_RECEIVED", + "PMU_EVENT_SRAM_RD_TRAN_REQ_STALLED", + "PMU_EVENT_SRAM_WR_TRANS_ACCEPTED", + "PMU_EVENT_SRAM_WR_TRANS_COMPLETED_M", + "PMU_EVENT_SRAM_WR_TRANS_COMPLETED_S", + "PMU_EVENT_SRAM_WR_DATA_BEAT_WRITTEN", + "PMU_EVENT_SRAM_WR_TRAN_REQ_STALLED", + "PMU_EVENT_SRAM_WR_DATA_BEAT_STALLED", + "****", + "****", + "PMU_EVENT_SRAM_ENABLED_CYCLES", + "****", + "PMU_EVENT_SRAM_RD_STALL_LIMIT", + "PMU_EVENT_SRAM_WR_STALL_LIMIT", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "PMU_EVENT_AXI_LATENCY_ANY", + "PMU_EVENT_AXI_LATENCY_32", + "PMU_EVENT_AXI_LATENCY_64", + 
"PMU_EVENT_AXI_LATENCY_128", + "PMU_EVENT_AXI_LATENCY_256", + "PMU_EVENT_AXI_LATENCY_512", + "PMU_EVENT_AXI_LATENCY_1024", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "PMU_EVENT_ECC_DMA", + "PMU_EVENT_ECC_MAC_IB", + "PMU_EVENT_ECC_MAC_AB", + "PMU_EVENT_ECC_AO_CB", + "PMU_EVENT_ECC_AO_OB", + "PMU_EVENT_ECC_AO_LUT", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + 
"****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "PMU_EVENT_EXT_RD_TRANS_ACCEPTED", + "PMU_EVENT_EXT_RD_TRANS_COMPLETED", + "PMU_EVENT_EXT_RD_DATA_BEAT_RECEIVED", + "PMU_EVENT_EXT_RD_TRAN_REQ_STALLED", + "PMU_EVENT_EXT_WR_TRANS_ACCEPTED", + "PMU_EVENT_EXT_WR_TRANS_COMPLETED_M", + "PMU_EVENT_EXT_WR_TRANS_COMPLETED_S", + "PMU_EVENT_EXT_WR_DATA_BEAT_WRITTEN", + "PMU_EVENT_EXT_WR_TRAN_REQ_STALLED", + "PMU_EVENT_EXT_WR_DATA_BEAT_STALLED", + "****", + "****", + "PMU_EVENT_EXT_ENABLED_CYCLES", + "****", + "PMU_EVENT_EXT_RD_STALL_LIMIT", + "PMU_EVENT_EXT_WR_STALL_LIMIT", +}; + +static const char* pmu_port_disable_str[] = +{ + "PMU_PORT_DISABLE_ENABLE", + "PMU_PORT_DISABLE_DISABLE", +}; + +static const char* pooling_mode_str[] = +{ + "POOLING_MODE_MAX", + "POOLING_MODE_AVERAGE", + "POOLING_MODE_REDUCE_SUM", + "POOLING_MODE_SUM", + "POOLING_MODE_NONE", + "POOLING_MODE_MIN", + "POOLING_MODE_ARGMAX_X", + "POOLING_MODE_ARGMAX_Y", +}; + +static const char* privilege_level_str[] = +{ + "PRIVILEGE_LEVEL_USER", + "PRIVILEGE_LEVEL_PRIVILEGED", +}; + +static const char* ram_id_str[] = +{ + "RAM_ID_LUT", + "RAM_ID_IB", + "RAM_ID_AB", + "RAM_ID_CB", + "RAM_ID_OB", +}; + +static const char* resize_mode_str[] = +{ + "RESIZE_MODE_BILINEAR", + "RESIZE_MODE_REPLICATE", + "RESIZE_MODE_NEAREST", +}; + +static const char* round_mode_ifm_str[] = +{ + "ROUND_MODE_IFM_DOUBLE_SYMMETRIC", + "ROUND_MODE_IFM_NATURAL", +}; + +static const char* round_mode_ofm_str[] = +{ + "ROUND_MODE_OFM_DOUBLE_SYMMETRIC", + "ROUND_MODE_OFM_NATURAL", + "ROUND_MODE_OFM_DOUBLE_ASYMMETRIC", + "ROUND_MODE_OFM_SYMMETRIC", + "ROUND_MODE_OFM_TRUNCATE_TO_ZERO", + 
"ROUND_MODE_OFM_TRUNCATE_TO_LOWER", +}; + +static const char* security_level_str[] = +{ + "SECURITY_LEVEL_SECURE", + "SECURITY_LEVEL_NON_SECURE", +}; + +static const char* state_str[] = +{ + "STATE_STOPPED", + "STATE_RUNNING", +}; + +static const char* wd_active_core_str[] = +{ + "WD_ACTIVE_CORE_NONE", + "WD_ACTIVE_CORE_STANDARD", + "WD_ACTIVE_CORE_FAST", + "WD_ACTIVE_CORE_TENSOR", +}; + +static const char* weight_format_str[] = +{ + "WEIGHT_FORMAT_SWD", + "WEIGHT_FORMAT_FWD", +}; + +static const char* weight_order_str[] = +{ + "WEIGHT_ORDER_DEPTH_FIRST", + "WEIGHT_ORDER_PART_KERNEL_FIRST", +}; + +static const char* weight_sparsity_str[] = +{ + "WEIGHT_SPARSITY_NONE", + "WEIGHT_SPARSITY_SPARSE_2_4", +}; + +#endif + + + +struct id_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t version_status : 4; + uint32_t version_minor : 4; + uint32_t version_major : 4; + uint32_t product_major : 4; + uint32_t arch_patch_rev : 4; + uint32_t arch_minor_rev : 8; + uint32_t arch_major_rev : 4; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR id_r() : + word0(536899584) + {} + CONSTEXPR id_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + id_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_version_status() const + { + auto v = ((1U << 4) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR id_r& set_version_status(uint32_t value) + { + word0 = (~(((1U << 4) - 1)<<0) & word0) | ((((1U << 4) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_version_minor() const + { + auto v = ((1U << 4) - 1) & (word0 >> 4); + return v; + } + CONSTEXPR id_r& set_version_minor(uint32_t value) + { + word0 = (~(((1U << 4) - 1)<<4) & word0) | ((((1U << 4) - 1) & value) << 4); + return *this; + } + CONSTEXPR uint32_t get_version_major() const + { + auto v = ((1U << 4) - 1) & (word0 >> 8); + return v; + } + CONSTEXPR id_r& 
set_version_major(uint32_t value) + { + word0 = (~(((1U << 4) - 1)<<8) & word0) | ((((1U << 4) - 1) & value) << 8); + return *this; + } + CONSTEXPR uint32_t get_product_major() const + { + auto v = ((1U << 4) - 1) & (word0 >> 12); + return v; + } + CONSTEXPR id_r& set_product_major(uint32_t value) + { + word0 = (~(((1U << 4) - 1)<<12) & word0) | ((((1U << 4) - 1) & value) << 12); + return *this; + } + CONSTEXPR uint32_t get_arch_patch_rev() const + { + auto v = ((1U << 4) - 1) & (word0 >> 16); + return v; + } + CONSTEXPR id_r& set_arch_patch_rev(uint32_t value) + { + word0 = (~(((1U << 4) - 1)<<16) & word0) | ((((1U << 4) - 1) & value) << 16); + return *this; + } + CONSTEXPR uint32_t get_arch_minor_rev() const + { + auto v = ((1U << 8) - 1) & (word0 >> 20); + return v; + } + CONSTEXPR id_r& set_arch_minor_rev(uint32_t value) + { + word0 = (~(((1U << 8) - 1)<<20) & word0) | ((((1U << 8) - 1) & value) << 20); + return *this; + } + CONSTEXPR uint32_t get_arch_major_rev() const + { + auto v = ((1U << 4) - 1) & (word0 >> 28); + return v; + } + CONSTEXPR id_r& set_arch_major_rev(uint32_t value) + { + word0 = (~(((1U << 4) - 1)<<28) & word0) | ((((1U << 4) - 1) & value) << 28); + return *this; + } +#endif +}; + + +struct status_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t state : 1; + uint32_t irq_raised : 1; + uint32_t bus_status : 1; + uint32_t reset_status : 1; + uint32_t cmd_parse_error : 1; + uint32_t cmd_end_reached : 1; + uint32_t pmu_irq_raised : 1; + uint32_t reserved0 : 1; + uint32_t ecc_fault : 1; + uint32_t branch_fault : 1; + uint32_t reserved1 : 1; + uint32_t faulting_interface : 1; + uint32_t faulting_channel : 4; + uint32_t irq_history_mask : 16; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR status_r() : + word0(8) + {} + CONSTEXPR status_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + 
status_r copy() + { + return *this; + } + CONSTEXPR NPU_NAMESPACE::state get_state() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR status_r& set_state(NPU_NAMESPACE::state value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & static_cast(value)) << 0); + return *this; + } + CONSTEXPR uint32_t get_irq_raised() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + return v; + } + CONSTEXPR status_r& set_irq_raised(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & value) << 1); + return *this; + } + CONSTEXPR uint32_t get_bus_status() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + CONSTEXPR status_r& set_bus_status(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) & value) << 2); + return *this; + } + CONSTEXPR uint32_t get_reset_status() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR status_r& set_reset_status(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + return *this; + } + CONSTEXPR uint32_t get_cmd_parse_error() const + { + auto v = ((1U << 1) - 1) & (word0 >> 4); + return v; + } + CONSTEXPR status_r& set_cmd_parse_error(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<4) & word0) | ((((1U << 1) - 1) & value) << 4); + return *this; + } + CONSTEXPR uint32_t get_cmd_end_reached() const + { + auto v = ((1U << 1) - 1) & (word0 >> 5); + return v; + } + CONSTEXPR status_r& set_cmd_end_reached(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<5) & word0) | ((((1U << 1) - 1) & value) << 5); + return *this; + } + CONSTEXPR uint32_t get_pmu_irq_raised() const + { + auto v = ((1U << 1) - 1) & (word0 >> 6); + return v; + } + CONSTEXPR status_r& set_pmu_irq_raised(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<6) & word0) | ((((1U << 1) - 1) & value) << 6); + return *this; + } + CONSTEXPR uint32_t 
get_ecc_fault() const + { + auto v = ((1U << 1) - 1) & (word0 >> 8); + return v; + } + CONSTEXPR status_r& set_ecc_fault(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<8) & word0) | ((((1U << 1) - 1) & value) << 8); + return *this; + } + CONSTEXPR uint32_t get_branch_fault() const + { + auto v = ((1U << 1) - 1) & (word0 >> 9); + return v; + } + CONSTEXPR status_r& set_branch_fault(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<9) & word0) | ((((1U << 1) - 1) & value) << 9); + return *this; + } + CONSTEXPR NPU_NAMESPACE::dma_fault_src get_faulting_interface() const + { + auto v = ((1U << 1) - 1) & (word0 >> 11); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR status_r& set_faulting_interface(NPU_NAMESPACE::dma_fault_src value) + { + word0 = (~(((1U << 1) - 1)<<11) & word0) | ((((1U << 1) - 1) & static_cast(value)) << 11); + return *this; + } + CONSTEXPR NPU_NAMESPACE::dma_fault_channel get_faulting_channel() const + { + auto v = ((1U << 4) - 1) & (word0 >> 12); + assert(v <= 9); + return static_cast(v); + } + CONSTEXPR status_r& set_faulting_channel(NPU_NAMESPACE::dma_fault_channel value) + { + word0 = (~(((1U << 4) - 1)<<12) & word0) | ((((1U << 4) - 1) & static_cast(value)) << 12); + return *this; + } + CONSTEXPR uint32_t get_irq_history_mask() const + { + auto v = ((1U << 16) - 1) & (word0 >> 16); + return v; + } + CONSTEXPR status_r& set_irq_history_mask(uint32_t value) + { + word0 = (~(((1U << 16) - 1)<<16) & word0) | ((((1U << 16) - 1) & value) << 16); + return *this; + } +#endif +}; + + +struct cmd_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t transition_to_running_state : 1; + uint32_t clear_irq : 1; + uint32_t clock_q_enable : 1; + uint32_t power_q_enable : 1; + uint32_t stop_request : 1; + uint32_t reserved0 : 11; + uint32_t clear_irq_history : 16; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR cmd_r() : + word0(12) + {} + CONSTEXPR cmd_r(uint32_t init) : + word0(init) + {} + CONSTEXPR 
void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + cmd_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_transition_to_running_state() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR cmd_r& set_transition_to_running_state(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_clear_irq() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + return v; + } + CONSTEXPR cmd_r& set_clear_irq(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & value) << 1); + return *this; + } + CONSTEXPR uint32_t get_clock_q_enable() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + CONSTEXPR cmd_r& set_clock_q_enable(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) & value) << 2); + return *this; + } + CONSTEXPR uint32_t get_power_q_enable() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR cmd_r& set_power_q_enable(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + return *this; + } + CONSTEXPR uint32_t get_stop_request() const + { + auto v = ((1U << 1) - 1) & (word0 >> 4); + return v; + } + CONSTEXPR cmd_r& set_stop_request(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<4) & word0) | ((((1U << 1) - 1) & value) << 4); + return *this; + } + CONSTEXPR uint32_t get_clear_irq_history() const + { + auto v = ((1U << 16) - 1) & (word0 >> 16); + return v; + } + CONSTEXPR cmd_r& set_clear_irq_history(uint32_t value) + { + word0 = (~(((1U << 16) - 1)<<16) & word0) | ((((1U << 16) - 1) & value) << 16); + return *this; + } +#endif +}; + + +struct reset_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t pending_CPL : 1; + uint32_t pending_CSL : 1; + uint32_t reserved0 : 30; + }; + uint32_t word; + }; +#else +private: + uint32_t 
word0; +public: + CONSTEXPR reset_r() : + word0(0) + {} + CONSTEXPR reset_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + reset_r copy() + { + return *this; + } + CONSTEXPR NPU_NAMESPACE::privilege_level get_pending_CPL() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR reset_r& set_pending_CPL(NPU_NAMESPACE::privilege_level value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & static_cast(value)) << 0); + return *this; + } + CONSTEXPR NPU_NAMESPACE::security_level get_pending_CSL() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR reset_r& set_pending_CSL(NPU_NAMESPACE::security_level value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & static_cast(value)) << 1); + return *this; + } +#endif +}; + + +struct qbase_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t offset_LO : 32; + uint32_t offset_HI : 8; + uint32_t reserved0 : 24; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR qbase_r() : + word0(0), + word1(0) + {} + CONSTEXPR qbase_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + qbase_r copy() + { + return *this; + } +#endif +}; + + +struct qread_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t QREAD : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR qread_r() 
: + word0(0) + {} + CONSTEXPR qread_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + qread_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_QREAD() const + { + auto v = word0; + return v; + } + CONSTEXPR qread_r& set_QREAD(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct qconfig_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t cmd_region0 : 2; + uint32_t reserved0 : 30; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR qconfig_r() : + word0(0) + {} + CONSTEXPR qconfig_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + qconfig_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_cmd_region0() const + { + auto v = ((1U << 2) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR qconfig_r& set_cmd_region0(uint32_t value) + { + word0 = (~(((1U << 2) - 1)<<0) & word0) | ((((1U << 2) - 1) & value) << 0); + return *this; + } +#endif +}; + + +struct qsize_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t QSIZE : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR qsize_r() : + word0(0) + {} + CONSTEXPR qsize_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + qsize_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_QSIZE() const + { + auto v = word0; + return v; + } + CONSTEXPR qsize_r& set_QSIZE(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct prot_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t active_CPL : 1; + uint32_t active_CSL : 1; + uint32_t reserved0 : 30; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR prot_r() 
: + word0(0) + {} + CONSTEXPR prot_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + prot_r copy() + { + return *this; + } + CONSTEXPR NPU_NAMESPACE::privilege_level get_active_CPL() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR prot_r& set_active_CPL(NPU_NAMESPACE::privilege_level value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & static_cast(value)) << 0); + return *this; + } + CONSTEXPR NPU_NAMESPACE::security_level get_active_CSL() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR prot_r& set_active_CSL(NPU_NAMESPACE::security_level value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & static_cast(value)) << 1); + return *this; + } +#endif +}; + + +struct config_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t macs_per_cc : 4; + uint32_t cmd_stream_version : 4; + uint32_t num_axi_sram : 2; + uint32_t num_axi_ext : 1; + uint32_t reserved0 : 1; + uint32_t num_wd : 2; + uint32_t reserved1 : 13; + uint32_t custom_dma : 1; + uint32_t product : 4; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR config_r() : + word0(536870928) + {} + CONSTEXPR config_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + config_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_macs_per_cc() const + { + auto v = ((1U << 4) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR config_r& set_macs_per_cc(uint32_t value) + { + word0 = (~(((1U << 4) - 1)<<0) & word0) | ((((1U << 4) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_cmd_stream_version() const + { + auto v = ((1U << 4) - 1) & (word0 >> 4); + return v; + } + CONSTEXPR 
config_r& set_cmd_stream_version(uint32_t value) + { + word0 = (~(((1U << 4) - 1)<<4) & word0) | ((((1U << 4) - 1) & value) << 4); + return *this; + } + CONSTEXPR uint32_t get_num_axi_sram() const + { + auto v = ((1U << 2) - 1) & (word0 >> 8); + return v; + } + CONSTEXPR config_r& set_num_axi_sram(uint32_t value) + { + word0 = (~(((1U << 2) - 1)<<8) & word0) | ((((1U << 2) - 1) & value) << 8); + return *this; + } + CONSTEXPR uint32_t get_num_axi_ext() const + { + auto v = ((1U << 1) - 1) & (word0 >> 10); + return v; + } + CONSTEXPR config_r& set_num_axi_ext(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<10) & word0) | ((((1U << 1) - 1) & value) << 10); + return *this; + } + CONSTEXPR uint32_t get_num_wd() const + { + auto v = ((1U << 2) - 1) & (word0 >> 12); + return v; + } + CONSTEXPR config_r& set_num_wd(uint32_t value) + { + word0 = (~(((1U << 2) - 1)<<12) & word0) | ((((1U << 2) - 1) & value) << 12); + return *this; + } + CONSTEXPR NPU_NAMESPACE::custom_dma get_custom_dma() const + { + auto v = ((1U << 1) - 1) & (word0 >> 27); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR config_r& set_custom_dma(NPU_NAMESPACE::custom_dma value) + { + word0 = (~(((1U << 1) - 1)<<27) & word0) | ((((1U << 1) - 1) & static_cast(value)) << 27); + return *this; + } + CONSTEXPR uint32_t get_product() const + { + auto v = ((1U << 4) - 1) & (word0 >> 28); + return v; + } + CONSTEXPR config_r& set_product(uint32_t value) + { + word0 = (~(((1U << 4) - 1)<<28) & word0) | ((((1U << 4) - 1) & value) << 28); + return *this; + } +#endif +}; + + +struct cond_status_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t result_flag : 1; + uint32_t reserved0 : 31; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR cond_status_r() : + word0(0) + {} + CONSTEXPR cond_status_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + 
cond_status_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_result_flag() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR cond_status_r& set_result_flag(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } +#endif +}; + + +struct power_ctrl_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t mac_step_cycles : 6; + uint32_t reserved0 : 26; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR power_ctrl_r() : + word0(0) + {} + CONSTEXPR power_ctrl_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + power_ctrl_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_mac_step_cycles() const + { + auto v = ((1U << 6) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR power_ctrl_r& set_mac_step_cycles(uint32_t value) + { + word0 = (~(((1U << 6) - 1)<<0) & word0) | ((((1U << 6) - 1) & value) << 0); + return *this; + } +#endif +}; + + +struct regioncfg_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t region0 : 2; + uint32_t region1 : 2; + uint32_t region2 : 2; + uint32_t region3 : 2; + uint32_t region4 : 2; + uint32_t region5 : 2; + uint32_t region6 : 2; + uint32_t region7 : 2; + uint32_t reserved0 : 16; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR regioncfg_r() : + word0(0) + {} + CONSTEXPR regioncfg_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + regioncfg_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_region0() const + { + auto v = ((1U << 2) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR regioncfg_r& set_region0(uint32_t value) + { + word0 = (~(((1U << 2) - 1)<<0) & word0) | ((((1U << 2) - 1) & value) << 0); + return *this; + } + 
CONSTEXPR uint32_t get_region1() const + { + auto v = ((1U << 2) - 1) & (word0 >> 2); + return v; + } + CONSTEXPR regioncfg_r& set_region1(uint32_t value) + { + word0 = (~(((1U << 2) - 1)<<2) & word0) | ((((1U << 2) - 1) & value) << 2); + return *this; + } + CONSTEXPR uint32_t get_region2() const + { + auto v = ((1U << 2) - 1) & (word0 >> 4); + return v; + } + CONSTEXPR regioncfg_r& set_region2(uint32_t value) + { + word0 = (~(((1U << 2) - 1)<<4) & word0) | ((((1U << 2) - 1) & value) << 4); + return *this; + } + CONSTEXPR uint32_t get_region3() const + { + auto v = ((1U << 2) - 1) & (word0 >> 6); + return v; + } + CONSTEXPR regioncfg_r& set_region3(uint32_t value) + { + word0 = (~(((1U << 2) - 1)<<6) & word0) | ((((1U << 2) - 1) & value) << 6); + return *this; + } + CONSTEXPR uint32_t get_region4() const + { + auto v = ((1U << 2) - 1) & (word0 >> 8); + return v; + } + CONSTEXPR regioncfg_r& set_region4(uint32_t value) + { + word0 = (~(((1U << 2) - 1)<<8) & word0) | ((((1U << 2) - 1) & value) << 8); + return *this; + } + CONSTEXPR uint32_t get_region5() const + { + auto v = ((1U << 2) - 1) & (word0 >> 10); + return v; + } + CONSTEXPR regioncfg_r& set_region5(uint32_t value) + { + word0 = (~(((1U << 2) - 1)<<10) & word0) | ((((1U << 2) - 1) & value) << 10); + return *this; + } + CONSTEXPR uint32_t get_region6() const + { + auto v = ((1U << 2) - 1) & (word0 >> 12); + return v; + } + CONSTEXPR regioncfg_r& set_region6(uint32_t value) + { + word0 = (~(((1U << 2) - 1)<<12) & word0) | ((((1U << 2) - 1) & value) << 12); + return *this; + } + CONSTEXPR uint32_t get_region7() const + { + auto v = ((1U << 2) - 1) & (word0 >> 14); + return v; + } + CONSTEXPR regioncfg_r& set_region7(uint32_t value) + { + word0 = (~(((1U << 2) - 1)<<14) & word0) | ((((1U << 2) - 1) & value) << 14); + return *this; + } +#endif +}; + + +struct mem_attr_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t mem_domain : 2; + uint32_t axi_port : 1; + uint32_t reserved0 : 1; + uint32_t 
memtype : 4; + uint32_t reserved1 : 24; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR mem_attr_r() : + word0(0) + {} + CONSTEXPR mem_attr_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + mem_attr_r copy() + { + return *this; + } + CONSTEXPR NPU_NAMESPACE::axi_mem_domain get_mem_domain() const + { + auto v = ((1U << 2) - 1) & (word0 >> 0); + assert(v <= 3); + return static_cast(v); + } + CONSTEXPR mem_attr_r& set_mem_domain(NPU_NAMESPACE::axi_mem_domain value) + { + word0 = (~(((1U << 2) - 1)<<0) & word0) | ((((1U << 2) - 1) & static_cast(value)) << 0); + return *this; + } + CONSTEXPR NPU_NAMESPACE::axi_port get_axi_port() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR mem_attr_r& set_axi_port(NPU_NAMESPACE::axi_port value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) & static_cast(value)) << 2); + return *this; + } + CONSTEXPR NPU_NAMESPACE::axi_mem_encoding get_memtype() const + { + auto v = ((1U << 4) - 1) & (word0 >> 4); + assert(v <= 11); + return static_cast(v); + } + CONSTEXPR mem_attr_r& set_memtype(NPU_NAMESPACE::axi_mem_encoding value) + { + word0 = (~(((1U << 4) - 1)<<4) & word0) | ((((1U << 4) - 1) & static_cast(value)) << 4); + return *this; + } +#endif +}; + + +struct axi_sram_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t max_outstanding_read_m1 : 6; + uint32_t reserved0 : 2; + uint32_t max_outstanding_write_m1 : 5; + uint32_t reserved1 : 3; + uint32_t max_beats : 2; + uint32_t reserved2 : 14; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR axi_sram_r() : + word0(0) + {} + CONSTEXPR axi_sram_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + axi_sram_r 
copy() + { + return *this; + } + CONSTEXPR uint32_t get_max_outstanding_read_m1() const + { + auto v = ((1U << 6) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR axi_sram_r& set_max_outstanding_read_m1(uint32_t value) + { + word0 = (~(((1U << 6) - 1)<<0) & word0) | ((((1U << 6) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_max_outstanding_write_m1() const + { + auto v = ((1U << 5) - 1) & (word0 >> 8); + return v; + } + CONSTEXPR axi_sram_r& set_max_outstanding_write_m1(uint32_t value) + { + word0 = (~(((1U << 5) - 1)<<8) & word0) | ((((1U << 5) - 1) & value) << 8); + return *this; + } + CONSTEXPR NPU_NAMESPACE::max_beats get_max_beats() const + { + auto v = ((1U << 2) - 1) & (word0 >> 16); + assert(v <= 2); + return static_cast(v); + } + CONSTEXPR axi_sram_r& set_max_beats(NPU_NAMESPACE::max_beats value) + { + word0 = (~(((1U << 2) - 1)<<16) & word0) | ((((1U << 2) - 1) & static_cast(value)) << 16); + return *this; + } +#endif +}; + + +struct axi_ext_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t max_outstanding_read_m1 : 6; + uint32_t reserved0 : 2; + uint32_t max_outstanding_write_m1 : 5; + uint32_t reserved1 : 3; + uint32_t max_beats : 2; + uint32_t reserved2 : 14; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR axi_ext_r() : + word0(0) + {} + CONSTEXPR axi_ext_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + axi_ext_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_max_outstanding_read_m1() const + { + auto v = ((1U << 6) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR axi_ext_r& set_max_outstanding_read_m1(uint32_t value) + { + word0 = (~(((1U << 6) - 1)<<0) & word0) | ((((1U << 6) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_max_outstanding_write_m1() const + { + auto v = ((1U << 5) - 1) & (word0 >> 8); + return v; + } + CONSTEXPR axi_ext_r& 
set_max_outstanding_write_m1(uint32_t value) + { + word0 = (~(((1U << 5) - 1)<<8) & word0) | ((((1U << 5) - 1) & value) << 8); + return *this; + } + CONSTEXPR NPU_NAMESPACE::max_beats get_max_beats() const + { + auto v = ((1U << 2) - 1) & (word0 >> 16); + assert(v <= 2); + return static_cast(v); + } + CONSTEXPR axi_ext_r& set_max_beats(NPU_NAMESPACE::max_beats value) + { + word0 = (~(((1U << 2) - 1)<<16) & word0) | ((((1U << 2) - 1) & static_cast(value)) << 16); + return *this; + } +#endif +}; + + +struct cfg_sram_cap_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t max_outstanding_read_m1 : 6; + uint32_t reserved0 : 2; + uint32_t max_outstanding_write_m1 : 5; + uint32_t reserved1 : 3; + uint32_t max_beats : 2; + uint32_t reserved2 : 14; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR cfg_sram_cap_r() : + word0(0) + {} + CONSTEXPR cfg_sram_cap_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + cfg_sram_cap_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_max_outstanding_read_m1() const + { + auto v = ((1U << 6) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR cfg_sram_cap_r& set_max_outstanding_read_m1(uint32_t value) + { + word0 = (~(((1U << 6) - 1)<<0) & word0) | ((((1U << 6) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_max_outstanding_write_m1() const + { + auto v = ((1U << 5) - 1) & (word0 >> 8); + return v; + } + CONSTEXPR cfg_sram_cap_r& set_max_outstanding_write_m1(uint32_t value) + { + word0 = (~(((1U << 5) - 1)<<8) & word0) | ((((1U << 5) - 1) & value) << 8); + return *this; + } + CONSTEXPR NPU_NAMESPACE::max_beats get_max_beats() const + { + auto v = ((1U << 2) - 1) & (word0 >> 16); + assert(v <= 2); + return static_cast(v); + } + CONSTEXPR cfg_sram_cap_r& set_max_beats(NPU_NAMESPACE::max_beats value) + { + word0 = (~(((1U << 2) - 1)<<16) & word0) | ((((1U << 2) - 1) 
& static_cast(value)) << 16); + return *this; + } +#endif +}; + + +struct cfg_ext_cap_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t max_outstanding_read_m1 : 6; + uint32_t reserved0 : 2; + uint32_t max_outstanding_write_m1 : 5; + uint32_t reserved1 : 3; + uint32_t max_beats : 2; + uint32_t reserved2 : 14; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR cfg_ext_cap_r() : + word0(0) + {} + CONSTEXPR cfg_ext_cap_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + cfg_ext_cap_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_max_outstanding_read_m1() const + { + auto v = ((1U << 6) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR cfg_ext_cap_r& set_max_outstanding_read_m1(uint32_t value) + { + word0 = (~(((1U << 6) - 1)<<0) & word0) | ((((1U << 6) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_max_outstanding_write_m1() const + { + auto v = ((1U << 5) - 1) & (word0 >> 8); + return v; + } + CONSTEXPR cfg_ext_cap_r& set_max_outstanding_write_m1(uint32_t value) + { + word0 = (~(((1U << 5) - 1)<<8) & word0) | ((((1U << 5) - 1) & value) << 8); + return *this; + } + CONSTEXPR NPU_NAMESPACE::max_beats get_max_beats() const + { + auto v = ((1U << 2) - 1) & (word0 >> 16); + assert(v <= 2); + return static_cast(v); + } + CONSTEXPR cfg_ext_cap_r& set_max_beats(NPU_NAMESPACE::max_beats value) + { + word0 = (~(((1U << 2) - 1)<<16) & word0) | ((((1U << 2) - 1) & static_cast(value)) << 16); + return *this; + } +#endif +}; + + +struct cfg_sram_hash0_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t zero : 6; + uint32_t hash_LO : 26; + uint32_t hash_HI : 8; + uint32_t reserved0 : 24; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR cfg_sram_hash0_r() : + word0(0), + word1(0) + {} + CONSTEXPR cfg_sram_hash0_r(uint64_t init) : + 
word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + cfg_sram_hash0_r copy() + { + return *this; + } +#endif +}; + + +struct cfg_sram_hash1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t zero : 6; + uint32_t hash_LO : 26; + uint32_t hash_HI : 8; + uint32_t reserved0 : 24; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR cfg_sram_hash1_r() : + word0(0), + word1(0) + {} + CONSTEXPR cfg_sram_hash1_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + cfg_sram_hash1_r copy() + { + return *this; + } +#endif +}; + + +struct cfg_ext_hash0_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t zero : 6; + uint32_t hash_LO : 26; + uint32_t hash_HI : 8; + uint32_t reserved0 : 24; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR cfg_ext_hash0_r() : + word0(0), + word1(0) + {} + CONSTEXPR cfg_ext_hash0_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & 
static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + cfg_ext_hash0_r copy() + { + return *this; + } +#endif +}; + + +struct basep_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t offset_LO : 32; + uint32_t offset_HI : 8; + uint32_t reserved0 : 24; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR basep_r() : + word0(0), + word1(0) + {} + CONSTEXPR basep_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + basep_r copy() + { + return *this; + } +#endif +}; + + +struct clkforce_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t top_level_clk : 1; + uint32_t cc_clk : 1; + uint32_t dma_clk : 1; + uint32_t mac_clk : 1; + uint32_t ao_clk : 1; + uint32_t wd_clk : 1; + uint32_t reserved0 : 26; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR clkforce_r() : + word0(0) + {} + CONSTEXPR clkforce_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + clkforce_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_top_level_clk() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR clkforce_r& set_top_level_clk(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_cc_clk() const + { + 
auto v = ((1U << 1) - 1) & (word0 >> 1); + return v; + } + CONSTEXPR clkforce_r& set_cc_clk(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & value) << 1); + return *this; + } + CONSTEXPR uint32_t get_dma_clk() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + CONSTEXPR clkforce_r& set_dma_clk(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) & value) << 2); + return *this; + } + CONSTEXPR uint32_t get_mac_clk() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR clkforce_r& set_mac_clk(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + return *this; + } + CONSTEXPR uint32_t get_ao_clk() const + { + auto v = ((1U << 1) - 1) & (word0 >> 4); + return v; + } + CONSTEXPR clkforce_r& set_ao_clk(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<4) & word0) | ((((1U << 1) - 1) & value) << 4); + return *this; + } + CONSTEXPR uint32_t get_wd_clk() const + { + auto v = ((1U << 1) - 1) & (word0 >> 5); + return v; + } + CONSTEXPR clkforce_r& set_wd_clk(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<5) & word0) | ((((1U << 1) - 1) & value) << 5); + return *this; + } +#endif +}; + + +struct debug_address_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t addr : 28; + uint32_t ram_id : 4; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR debug_address_r() : + word0(0) + {} + CONSTEXPR debug_address_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + debug_address_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_addr() const + { + auto v = ((1U << 28) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR debug_address_r& set_addr(uint32_t value) + { + word0 = (~(((1U << 28) - 1)<<0) & word0) | ((((1U << 28) - 1) & value) << 0); + return *this; + } + CONSTEXPR 
NPU_NAMESPACE::ram_id get_ram_id() const + { + auto v = ((1U << 4) - 1) & (word0 >> 28); + assert(v <= 4); + return static_cast(v); + } + CONSTEXPR debug_address_r& set_ram_id(NPU_NAMESPACE::ram_id value) + { + word0 = (~(((1U << 4) - 1)<<28) & word0) | ((((1U << 4) - 1) & static_cast(value)) << 28); + return *this; + } +#endif +}; + + +struct debug_misc_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t misc : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR debug_misc_r() : + word0(0) + {} + CONSTEXPR debug_misc_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + debug_misc_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_misc() const + { + auto v = word0; + return v; + } + CONSTEXPR debug_misc_r& set_misc(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct dma_ifm_src_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t offset_LO : 32; + uint32_t offset_HI : 8; + uint32_t reserved0 : 24; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma_ifm_src_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma_ifm_src_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma_ifm_src_r copy() + { + return *this; + } +#endif +}; + + +struct dma_ifm_dst_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR 
dma_ifm_dst_r() : + word0(0) + {} + CONSTEXPR dma_ifm_dst_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + dma_ifm_dst_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR dma_ifm_dst_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct dma_ofm_src_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR dma_ofm_src_r() : + word0(0) + {} + CONSTEXPR dma_ofm_src_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + dma_ofm_src_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR dma_ofm_src_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct dma_ofm_dst_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t offset_LO : 32; + uint32_t offset_HI : 8; + uint32_t reserved0 : 24; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma_ofm_dst_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma_ofm_dst_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma_ofm_dst_r copy() + { + return *this; + } +#endif +}; + + +struct dma_weight_src_r +{ +#ifndef __cplusplus + union + { + 
struct + { + uint32_t offset_LO : 32; + uint32_t offset_HI : 8; + uint32_t reserved0 : 24; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma_weight_src_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma_weight_src_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma_weight_src_r copy() + { + return *this; + } +#endif +}; + + +struct dma_cmd_src_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t offset_LO : 32; + uint32_t offset_HI : 8; + uint32_t reserved0 : 24; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma_cmd_src_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma_cmd_src_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma_cmd_src_r copy() + { + return *this; + } +#endif +}; + + +struct dma_cmd_size_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR dma_cmd_size_r() : + word0(0) + {} + CONSTEXPR dma_cmd_size_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + 
CONSTEXPR operator uint32_t() + { + return word0; + } + dma_cmd_size_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR dma_cmd_size_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct dma_m2m_src_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t offset_LO : 32; + uint32_t offset_HI : 8; + uint32_t reserved0 : 24; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma_m2m_src_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma_m2m_src_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma_m2m_src_r copy() + { + return *this; + } +#endif +}; + + +struct dma_m2m_dst_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t offset_LO : 32; + uint32_t offset_HI : 8; + uint32_t reserved0 : 24; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma_m2m_dst_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma_m2m_dst_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma_m2m_dst_r copy() + { + return *this; + } +#endif +}; + + +struct 
current_qread_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR current_qread_r() : + word0(0) + {} + CONSTEXPR current_qread_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + current_qread_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR current_qread_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct dma_scale_src_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t offset_LO : 32; + uint32_t offset_HI : 8; + uint32_t reserved0 : 24; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma_scale_src_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma_scale_src_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma_scale_src_r copy() + { + return *this; + } +#endif +}; + + +struct dma_weight1_src_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t offset_LO : 32; + uint32_t offset_HI : 8; + uint32_t reserved0 : 24; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma_weight1_src_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma_weight1_src_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} 
+ CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma_weight1_src_r copy() + { + return *this; + } +#endif +}; + + +struct dma_weight2_src_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t offset_LO : 32; + uint32_t offset_HI : 8; + uint32_t reserved0 : 24; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma_weight2_src_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma_weight2_src_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma_weight2_src_r copy() + { + return *this; + } +#endif +}; + + +struct dma_weight3_src_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t offset_LO : 32; + uint32_t offset_HI : 8; + uint32_t reserved0 : 24; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma_weight3_src_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma_weight3_src_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) 
<< 32) | word0; + } + dma_weight3_src_r copy() + { + return *this; + } +#endif +}; + + +struct current_op_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR current_op_r() : + word0(0) + {} + CONSTEXPR current_op_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + current_op_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR current_op_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct current_cmd_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR current_cmd_r() : + word0(0) + {} + CONSTEXPR current_cmd_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + current_cmd_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR current_cmd_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct internal_memory_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t mem_word : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR internal_memory_r() : + word0(0) + {} + CONSTEXPR internal_memory_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + internal_memory_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_mem_word() const + { + auto v = word0; + return v; + } + CONSTEXPR internal_memory_r& set_mem_word(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; 
+ + +struct ifm_pad_top_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_pad_top_r() : + word0(0) + {} + CONSTEXPR ifm_pad_top_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_pad_top_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_pad_top_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_pad_left_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_pad_left_r() : + word0(0) + {} + CONSTEXPR ifm_pad_left_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_pad_left_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_pad_left_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_pad_right_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_pad_right_r() : + word0(0) + {} + CONSTEXPR ifm_pad_right_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_pad_right_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_pad_right_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_pad_bottom_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t 
value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_pad_bottom_r() : + word0(0) + {} + CONSTEXPR ifm_pad_bottom_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_pad_bottom_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_pad_bottom_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_depth_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_depth_m1_r() : + word0(0) + {} + CONSTEXPR ifm_depth_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_depth_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_depth_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_precision_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_precision_r() : + word0(0) + {} + CONSTEXPR ifm_precision_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_precision_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_precision_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_upscale_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; 
+public: + CONSTEXPR ifm_upscale_r() : + word0(0) + {} + CONSTEXPR ifm_upscale_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_upscale_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_upscale_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_broadcast_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_broadcast_r() : + word0(0) + {} + CONSTEXPR ifm_broadcast_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_broadcast_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_broadcast_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_zero_point_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_zero_point_r() : + word0(0) + {} + CONSTEXPR ifm_zero_point_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_zero_point_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_zero_point_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_width0_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_width0_m1_r() : + word0(0) + {} + CONSTEXPR 
ifm_width0_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_width0_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_width0_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_height0_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_height0_m1_r() : + word0(0) + {} + CONSTEXPR ifm_height0_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_height0_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_height0_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_height1_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_height1_m1_r() : + word0(0) + {} + CONSTEXPR ifm_height1_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_height1_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_height1_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_region_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_region_r() : + word0(0) + {} + CONSTEXPR ifm_region_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void 
operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_region_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_region_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_width_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_width_m1_r() : + word0(0) + {} + CONSTEXPR ofm_width_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_width_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_width_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_height_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_height_m1_r() : + word0(0) + {} + CONSTEXPR ofm_height_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_height_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_height_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_depth_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_depth_m1_r() : + word0(0) + {} + CONSTEXPR ofm_depth_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + 
return word0; + } + ofm_depth_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_depth_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_precision_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_precision_r() : + word0(0) + {} + CONSTEXPR ofm_precision_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_precision_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_precision_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_blk_width_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_blk_width_m1_r() : + word0(0) + {} + CONSTEXPR ofm_blk_width_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_blk_width_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_blk_width_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_blk_height_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_blk_height_m1_r() : + word0(0) + {} + CONSTEXPR ofm_blk_height_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_blk_height_m1_r copy() + { 
+ return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_blk_height_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_blk_depth_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_blk_depth_m1_r() : + word0(0) + {} + CONSTEXPR ofm_blk_depth_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_blk_depth_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_blk_depth_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_zero_point_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_zero_point_r() : + word0(0) + {} + CONSTEXPR ofm_zero_point_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_zero_point_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_zero_point_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_width0_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_width0_m1_r() : + word0(0) + {} + CONSTEXPR ofm_width0_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_width0_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() 
const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_width0_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_height0_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_height0_m1_r() : + word0(0) + {} + CONSTEXPR ofm_height0_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_height0_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_height0_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_height1_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_height1_m1_r() : + word0(0) + {} + CONSTEXPR ofm_height1_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_height1_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_height1_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_region_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_region_r() : + word0(0) + {} + CONSTEXPR ofm_region_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_region_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_region_r& 
set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct kernel_width_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR kernel_width_m1_r() : + word0(0) + {} + CONSTEXPR kernel_width_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + kernel_width_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR kernel_width_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct kernel_height_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR kernel_height_m1_r() : + word0(0) + {} + CONSTEXPR kernel_height_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + kernel_height_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR kernel_height_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct kernel_stride_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR kernel_stride_r() : + word0(0) + {} + CONSTEXPR kernel_stride_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + kernel_stride_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR kernel_stride_r& set_value(uint32_t value) + { + word0 = value; + 
return *this; + } +#endif +}; + + +struct acc_format_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR acc_format_r() : + word0(0) + {} + CONSTEXPR acc_format_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + acc_format_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR acc_format_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct activation_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR activation_r() : + word0(0) + {} + CONSTEXPR activation_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + activation_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR activation_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct activation_min_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR activation_min_r() : + word0(0) + {} + CONSTEXPR activation_min_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + activation_min_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR activation_min_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct activation_max_r +{ +#ifndef __cplusplus + union + { + 
struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR activation_max_r() : + word0(0) + {} + CONSTEXPR activation_max_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + activation_max_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR activation_max_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct weight_region_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR weight_region_r() : + word0(0) + {} + CONSTEXPR weight_region_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + weight_region_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR weight_region_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct scale_region_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR scale_region_r() : + word0(0) + {} + CONSTEXPR scale_region_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + scale_region_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR scale_region_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct resize_x_scale_n_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else 
+private: + uint32_t word0; +public: + CONSTEXPR resize_x_scale_n_m1_r() : + word0(0) + {} + CONSTEXPR resize_x_scale_n_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + resize_x_scale_n_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR resize_x_scale_n_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct resize_y_scale_n_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR resize_y_scale_n_m1_r() : + word0(0) + {} + CONSTEXPR resize_y_scale_n_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + resize_y_scale_n_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR resize_y_scale_n_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct resize_x_offset_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR resize_x_offset_r() : + word0(0) + {} + CONSTEXPR resize_x_offset_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + resize_x_offset_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR resize_x_offset_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct resize_y_offset_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: 
+ uint32_t word0; +public: + CONSTEXPR resize_y_offset_r() : + word0(0) + {} + CONSTEXPR resize_y_offset_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + resize_y_offset_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR resize_y_offset_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct weight_format_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR weight_format_r() : + word0(0) + {} + CONSTEXPR weight_format_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + weight_format_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR weight_format_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct blockdep_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR blockdep_r() : + word0(0) + {} + CONSTEXPR blockdep_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + blockdep_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR blockdep_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct dma0_src_region_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR dma0_src_region_r() : + word0(0) + {} + CONSTEXPR 
dma0_src_region_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + dma0_src_region_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR dma0_src_region_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct dma0_dst_region_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR dma0_dst_region_r() : + word0(0) + {} + CONSTEXPR dma0_dst_region_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + dma0_dst_region_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR dma0_dst_region_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct dma0_size0_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR dma0_size0_r() : + word0(0) + {} + CONSTEXPR dma0_size0_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + dma0_size0_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR dma0_size0_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct dma0_size1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR dma0_size1_r() : + word0(0) + {} + CONSTEXPR dma0_size1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void 
operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + dma0_size1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR dma0_size1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct dma0_idx_region_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR dma0_idx_region_r() : + word0(0) + {} + CONSTEXPR dma0_idx_region_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + dma0_idx_region_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR dma0_idx_region_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm2_broadcast_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm2_broadcast_r() : + word0(0) + {} + CONSTEXPR ifm2_broadcast_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm2_broadcast_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm2_broadcast_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm2_precision_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm2_precision_r() : + word0(0) + {} + CONSTEXPR ifm2_precision_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR 
operator uint32_t() + { + return word0; + } + ifm2_precision_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm2_precision_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm2_zero_point_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm2_zero_point_r() : + word0(0) + {} + CONSTEXPR ifm2_zero_point_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm2_zero_point_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm2_zero_point_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm2_width0_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm2_width0_m1_r() : + word0(0) + {} + CONSTEXPR ifm2_width0_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm2_width0_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm2_width0_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm2_height0_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm2_height0_m1_r() : + word0(0) + {} + CONSTEXPR ifm2_height0_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + 
ifm2_height0_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm2_height0_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm2_height1_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm2_height1_m1_r() : + word0(0) + {} + CONSTEXPR ifm2_height1_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm2_height1_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm2_height1_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm2_region_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm2_region_r() : + word0(0) + {} + CONSTEXPR ifm2_region_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm2_region_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm2_region_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_base0_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm_base0_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm_base0_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR 
void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm_base0_r copy() + { + return *this; + } +#endif +}; + + +struct ifm_base1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm_base1_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm_base1_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm_base1_r copy() + { + return *this; + } +#endif +}; + + +struct ifm_base2_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm_base2_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm_base2_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm_base2_r copy() + { + return *this; + } +#endif +}; + + +struct ifm_base3_r +{ +#ifndef 
__cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm_base3_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm_base3_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm_base3_r copy() + { + return *this; + } +#endif +}; + + +struct ifm_stride_x_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm_stride_x_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm_stride_x_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm_stride_x_r copy() + { + return *this; + } +#endif +}; + + +struct ifm_stride_y_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm_stride_y_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm_stride_y_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + 
word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm_stride_y_r copy() + { + return *this; + } +#endif +}; + + +struct ifm_stride_c_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm_stride_c_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm_stride_c_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm_stride_c_r copy() + { + return *this; + } +#endif +}; + + +struct ofm_base0_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ofm_base0_r() : + word0(0), + word1(0) + {} + CONSTEXPR ofm_base0_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | 
word0; + } + ofm_base0_r copy() + { + return *this; + } +#endif +}; + + +struct ofm_base1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ofm_base1_r() : + word0(0), + word1(0) + {} + CONSTEXPR ofm_base1_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ofm_base1_r copy() + { + return *this; + } +#endif +}; + + +struct ofm_base2_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ofm_base2_r() : + word0(0), + word1(0) + {} + CONSTEXPR ofm_base2_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ofm_base2_r copy() + { + return *this; + } +#endif +}; + + +struct ofm_base3_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ofm_base3_r() : + word0(0), + word1(0) + {} + CONSTEXPR ofm_base3_r(uint64_t 
init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ofm_base3_r copy() + { + return *this; + } +#endif +}; + + +struct ofm_stride_x_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ofm_stride_x_r() : + word0(0), + word1(0) + {} + CONSTEXPR ofm_stride_x_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ofm_stride_x_r copy() + { + return *this; + } +#endif +}; + + +struct ofm_stride_y_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ofm_stride_y_r() : + word0(0), + word1(0) + {} + CONSTEXPR ofm_stride_y_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & 
static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ofm_stride_y_r copy() + { + return *this; + } +#endif +}; + + +struct ofm_stride_c_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ofm_stride_c_r() : + word0(0), + word1(0) + {} + CONSTEXPR ofm_stride_c_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ofm_stride_c_r copy() + { + return *this; + } +#endif +}; + + +struct weight_base_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR weight_base_r() : + word0(0), + word1(0) + {} + CONSTEXPR weight_base_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + weight_base_r copy() + { + return *this; + } +#endif +}; + + +struct weight_length_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; 
+#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR weight_length_r() : + word0(0), + word1(0) + {} + CONSTEXPR weight_length_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + weight_length_r copy() + { + return *this; + } +#endif +}; + + +struct scale_base_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR scale_base_r() : + word0(0), + word1(0) + {} + CONSTEXPR scale_base_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + scale_base_r copy() + { + return *this; + } +#endif +}; + + +struct scale_length_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR scale_length_r() : + word0(0), + word1(0) + {} + CONSTEXPR scale_length_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t 
value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + scale_length_r copy() + { + return *this; + } +#endif +}; + + +struct ofm_scale_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ofm_scale_r() : + word0(0), + word1(0) + {} + CONSTEXPR ofm_scale_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ofm_scale_r copy() + { + return *this; + } +#endif +}; + + +struct ifm_scale_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm_scale_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm_scale_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm_scale_r copy() + { + return *this; + } +#endif +}; + + +struct ifm2_scale_r +{ +#ifndef __cplusplus + union + { 
+ struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm2_scale_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm2_scale_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm2_scale_r copy() + { + return *this; + } +#endif +}; + + +struct op_scalar_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR op_scalar_r() : + word0(0), + word1(0) + {} + CONSTEXPR op_scalar_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + op_scalar_r copy() + { + return *this; + } +#endif +}; + + +struct dma0_src_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma0_src_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma0_src_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & 
static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma0_src_r copy() + { + return *this; + } +#endif +}; + + +struct dma0_dst_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma0_dst_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma0_dst_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma0_dst_r copy() + { + return *this; + } +#endif +}; + + +struct dma0_len_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma0_len_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma0_len_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma0_len_r copy() + { + return *this; + } 
+#endif +}; + + +struct dma0_src_stride0_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma0_src_stride0_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma0_src_stride0_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma0_src_stride0_r copy() + { + return *this; + } +#endif +}; + + +struct dma0_src_stride1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma0_src_stride1_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma0_src_stride1_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma0_src_stride1_r copy() + { + return *this; + } +#endif +}; + + +struct dma0_dst_stride0_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma0_dst_stride0_r() : + word0(0), + word1(0) + {} + CONSTEXPR 
dma0_dst_stride0_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma0_dst_stride0_r copy() + { + return *this; + } +#endif +}; + + +struct dma0_dst_stride1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma0_dst_stride1_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma0_dst_stride1_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma0_dst_stride1_r copy() + { + return *this; + } +#endif +}; + + +struct dma0_idx_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma0_idx_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma0_idx_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 
32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma0_idx_r copy() + { + return *this; + } +#endif +}; + + +struct ifm2_base0_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm2_base0_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm2_base0_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm2_base0_r copy() + { + return *this; + } +#endif +}; + + +struct ifm2_base1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm2_base1_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm2_base1_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm2_base1_r copy() + { + return *this; + } +#endif +}; + + +struct ifm2_base2_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: 
+ uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm2_base2_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm2_base2_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm2_base2_r copy() + { + return *this; + } +#endif +}; + + +struct ifm2_base3_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm2_base3_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm2_base3_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm2_base3_r copy() + { + return *this; + } +#endif +}; + + +struct ifm2_stride_x_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm2_stride_x_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm2_stride_x_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = 
static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm2_stride_x_r copy() + { + return *this; + } +#endif +}; + + +struct ifm2_stride_y_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm2_stride_y_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm2_stride_y_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm2_stride_y_r copy() + { + return *this; + } +#endif +}; + + +struct ifm2_stride_c_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm2_stride_c_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm2_stride_c_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm2_stride_c_r copy() + { + return *this; + } +#endif +}; + + +struct weight1_base_r +{ +#ifndef 
__cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR weight1_base_r() : + word0(0), + word1(0) + {} + CONSTEXPR weight1_base_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + weight1_base_r copy() + { + return *this; + } +#endif +}; + + +struct weight1_length_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR weight1_length_r() : + word0(0), + word1(0) + {} + CONSTEXPR weight1_length_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + weight1_length_r copy() + { + return *this; + } +#endif +}; + + +struct weight2_base_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR weight2_base_r() : + word0(0), + word1(0) + {} + CONSTEXPR weight2_base_r(uint64_t init) : + word0(static_cast((init) & 
static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + weight2_base_r copy() + { + return *this; + } +#endif +}; + + +struct weight2_length_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR weight2_length_r() : + word0(0), + word1(0) + {} + CONSTEXPR weight2_length_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + weight2_length_r copy() + { + return *this; + } +#endif +}; + + +struct weight3_base_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR weight3_base_r() : + word0(0), + word1(0) + {} + CONSTEXPR weight3_base_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR 
operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + weight3_base_r copy() + { + return *this; + } +#endif +}; + + +struct weight3_length_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR weight3_length_r() : + word0(0), + word1(0) + {} + CONSTEXPR weight3_length_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + weight3_length_r copy() + { + return *this; + } +#endif +}; + + +struct resize_x_step_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR resize_x_step_r() : + word0(0), + word1(0) + {} + CONSTEXPR resize_x_step_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + resize_x_step_r copy() + { + return *this; + } +#endif +}; + + +struct resize_y_step_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t 
word1; +public: + CONSTEXPR resize_y_step_r() : + word0(0), + word1(0) + {} + CONSTEXPR resize_y_step_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + resize_y_step_r copy() + { + return *this; + } +#endif +}; + + +struct dma0_idx_max_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma0_idx_max_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma0_idx_max_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma0_idx_max_r copy() + { + return *this; + } +#endif +}; + + +struct dma0_idx_skip1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma0_idx_skip1_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma0_idx_skip1_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = 
static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma0_idx_skip1_r copy() + { + return *this; + } +#endif +}; + + +struct revision_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR revision_r() : + word0(0) + {} + CONSTEXPR revision_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + revision_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR revision_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct pid4_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t PID4 : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pid4_r() : + word0(4) + {} + CONSTEXPR pid4_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pid4_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_PID4() const + { + auto v = word0; + return v; + } + CONSTEXPR pid4_r& set_PID4(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct pid5_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t PID5 : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pid5_r() : + word0(0) + {} + CONSTEXPR pid5_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pid5_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_PID5() const + { + auto v = word0; + return v; + } + 
CONSTEXPR pid5_r& set_PID5(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct pid6_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t PID6 : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pid6_r() : + word0(0) + {} + CONSTEXPR pid6_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pid6_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_PID6() const + { + auto v = word0; + return v; + } + CONSTEXPR pid6_r& set_PID6(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct pid7_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t PID7 : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pid7_r() : + word0(0) + {} + CONSTEXPR pid7_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pid7_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_PID7() const + { + auto v = word0; + return v; + } + CONSTEXPR pid7_r& set_PID7(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct pid0_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t PID0 : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pid0_r() : + word0(130) + {} + CONSTEXPR pid0_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pid0_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_PID0() const + { + auto v = word0; + return v; + } + CONSTEXPR pid0_r& set_PID0(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct pid1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t PID1 : 32; + }; + uint32_t word; + }; 
+#else +private: + uint32_t word0; +public: + CONSTEXPR pid1_r() : + word0(181) + {} + CONSTEXPR pid1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pid1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_PID1() const + { + auto v = word0; + return v; + } + CONSTEXPR pid1_r& set_PID1(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct pid2_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t PID2 : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pid2_r() : + word0(11) + {} + CONSTEXPR pid2_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pid2_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_PID2() const + { + auto v = word0; + return v; + } + CONSTEXPR pid2_r& set_PID2(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct pid3_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t PID3 : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pid3_r() : + word0(0) + {} + CONSTEXPR pid3_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pid3_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_PID3() const + { + auto v = word0; + return v; + } + CONSTEXPR pid3_r& set_PID3(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct cid0_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t CID0 : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR cid0_r() : + word0(13) + {} + CONSTEXPR cid0_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + 
CONSTEXPR operator uint32_t() + { + return word0; + } + cid0_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_CID0() const + { + auto v = word0; + return v; + } + CONSTEXPR cid0_r& set_CID0(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct cid1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t CID1 : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR cid1_r() : + word0(240) + {} + CONSTEXPR cid1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + cid1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_CID1() const + { + auto v = word0; + return v; + } + CONSTEXPR cid1_r& set_CID1(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct cid2_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t CID2 : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR cid2_r() : + word0(5) + {} + CONSTEXPR cid2_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + cid2_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_CID2() const + { + auto v = word0; + return v; + } + CONSTEXPR cid2_r& set_CID2(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct cid3_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t CID3 : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR cid3_r() : + word0(177) + {} + CONSTEXPR cid3_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + cid3_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_CID3() const + { + auto v = word0; + return v; + } + CONSTEXPR cid3_r& set_CID3(uint32_t value) + 
{ + word0 = value; + return *this; + } +#endif +}; + + +struct wd_status_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t ctrl_idle : 1; + uint32_t reserved0 : 1; + uint32_t active_core : 2; + uint32_t sc0_idle : 1; + uint32_t sc1_idle : 1; + uint32_t sc2_idle : 1; + uint32_t sc3_idle : 1; + uint32_t fc_idle : 1; + uint32_t tc_idle : 1; + uint32_t reserved1 : 6; + uint32_t wbuf0_valid : 1; + uint32_t wbuf0_idle : 1; + uint32_t wbuf1_valid : 1; + uint32_t wbuf1_idle : 1; + uint32_t wbuf2_valid : 1; + uint32_t wbuf2_idle : 1; + uint32_t wbuf3_valid : 1; + uint32_t wbuf3_idle : 1; + uint32_t stalled_by_ws_sc0 : 1; + uint32_t stalled_by_ws_sc1 : 1; + uint32_t stalled_by_ws_sc2 : 1; + uint32_t stalled_by_ws_sc3 : 1; + uint32_t stalled_by_ws_fc : 1; + uint32_t stalled_by_ws_tc : 1; + uint32_t stalled_by_wd_buf : 1; + uint32_t reserved2 : 1; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR wd_status_r() : + word0(0) + {} + CONSTEXPR wd_status_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + wd_status_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_ctrl_idle() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR wd_status_r& set_ctrl_idle(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } + CONSTEXPR NPU_NAMESPACE::wd_active_core get_active_core() const + { + auto v = ((1U << 2) - 1) & (word0 >> 2); + assert(v <= 3); + return static_cast(v); + } + CONSTEXPR wd_status_r& set_active_core(NPU_NAMESPACE::wd_active_core value) + { + word0 = (~(((1U << 2) - 1)<<2) & word0) | ((((1U << 2) - 1) & static_cast(value)) << 2); + return *this; + } + CONSTEXPR uint32_t get_sc0_idle() const + { + auto v = ((1U << 1) - 1) & (word0 >> 4); + return v; + } + CONSTEXPR wd_status_r& set_sc0_idle(uint32_t value) + { + word0 = 
(~(((1U << 1) - 1)<<4) & word0) | ((((1U << 1) - 1) & value) << 4); + return *this; + } + CONSTEXPR uint32_t get_sc1_idle() const + { + auto v = ((1U << 1) - 1) & (word0 >> 5); + return v; + } + CONSTEXPR wd_status_r& set_sc1_idle(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<5) & word0) | ((((1U << 1) - 1) & value) << 5); + return *this; + } + CONSTEXPR uint32_t get_sc2_idle() const + { + auto v = ((1U << 1) - 1) & (word0 >> 6); + return v; + } + CONSTEXPR wd_status_r& set_sc2_idle(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<6) & word0) | ((((1U << 1) - 1) & value) << 6); + return *this; + } + CONSTEXPR uint32_t get_sc3_idle() const + { + auto v = ((1U << 1) - 1) & (word0 >> 7); + return v; + } + CONSTEXPR wd_status_r& set_sc3_idle(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<7) & word0) | ((((1U << 1) - 1) & value) << 7); + return *this; + } + CONSTEXPR uint32_t get_fc_idle() const + { + auto v = ((1U << 1) - 1) & (word0 >> 8); + return v; + } + CONSTEXPR wd_status_r& set_fc_idle(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<8) & word0) | ((((1U << 1) - 1) & value) << 8); + return *this; + } + CONSTEXPR uint32_t get_tc_idle() const + { + auto v = ((1U << 1) - 1) & (word0 >> 9); + return v; + } + CONSTEXPR wd_status_r& set_tc_idle(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<9) & word0) | ((((1U << 1) - 1) & value) << 9); + return *this; + } + CONSTEXPR uint32_t get_wbuf0_valid() const + { + auto v = ((1U << 1) - 1) & (word0 >> 16); + return v; + } + CONSTEXPR wd_status_r& set_wbuf0_valid(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<16) & word0) | ((((1U << 1) - 1) & value) << 16); + return *this; + } + CONSTEXPR uint32_t get_wbuf0_idle() const + { + auto v = ((1U << 1) - 1) & (word0 >> 17); + return v; + } + CONSTEXPR wd_status_r& set_wbuf0_idle(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<17) & word0) | ((((1U << 1) - 1) & value) << 17); + return *this; + } + CONSTEXPR uint32_t get_wbuf1_valid() const + { + auto v = ((1U << 1) - 
1) & (word0 >> 18); + return v; + } + CONSTEXPR wd_status_r& set_wbuf1_valid(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<18) & word0) | ((((1U << 1) - 1) & value) << 18); + return *this; + } + CONSTEXPR uint32_t get_wbuf1_idle() const + { + auto v = ((1U << 1) - 1) & (word0 >> 19); + return v; + } + CONSTEXPR wd_status_r& set_wbuf1_idle(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<19) & word0) | ((((1U << 1) - 1) & value) << 19); + return *this; + } + CONSTEXPR uint32_t get_wbuf2_valid() const + { + auto v = ((1U << 1) - 1) & (word0 >> 20); + return v; + } + CONSTEXPR wd_status_r& set_wbuf2_valid(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<20) & word0) | ((((1U << 1) - 1) & value) << 20); + return *this; + } + CONSTEXPR uint32_t get_wbuf2_idle() const + { + auto v = ((1U << 1) - 1) & (word0 >> 21); + return v; + } + CONSTEXPR wd_status_r& set_wbuf2_idle(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<21) & word0) | ((((1U << 1) - 1) & value) << 21); + return *this; + } + CONSTEXPR uint32_t get_wbuf3_valid() const + { + auto v = ((1U << 1) - 1) & (word0 >> 22); + return v; + } + CONSTEXPR wd_status_r& set_wbuf3_valid(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<22) & word0) | ((((1U << 1) - 1) & value) << 22); + return *this; + } + CONSTEXPR uint32_t get_wbuf3_idle() const + { + auto v = ((1U << 1) - 1) & (word0 >> 23); + return v; + } + CONSTEXPR wd_status_r& set_wbuf3_idle(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<23) & word0) | ((((1U << 1) - 1) & value) << 23); + return *this; + } + CONSTEXPR uint32_t get_stalled_by_ws_sc0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 24); + return v; + } + CONSTEXPR wd_status_r& set_stalled_by_ws_sc0(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<24) & word0) | ((((1U << 1) - 1) & value) << 24); + return *this; + } + CONSTEXPR uint32_t get_stalled_by_ws_sc1() const + { + auto v = ((1U << 1) - 1) & (word0 >> 25); + return v; + } + CONSTEXPR wd_status_r& set_stalled_by_ws_sc1(uint32_t 
value) + { + word0 = (~(((1U << 1) - 1)<<25) & word0) | ((((1U << 1) - 1) & value) << 25); + return *this; + } + CONSTEXPR uint32_t get_stalled_by_ws_sc2() const + { + auto v = ((1U << 1) - 1) & (word0 >> 26); + return v; + } + CONSTEXPR wd_status_r& set_stalled_by_ws_sc2(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<26) & word0) | ((((1U << 1) - 1) & value) << 26); + return *this; + } + CONSTEXPR uint32_t get_stalled_by_ws_sc3() const + { + auto v = ((1U << 1) - 1) & (word0 >> 27); + return v; + } + CONSTEXPR wd_status_r& set_stalled_by_ws_sc3(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<27) & word0) | ((((1U << 1) - 1) & value) << 27); + return *this; + } + CONSTEXPR uint32_t get_stalled_by_ws_fc() const + { + auto v = ((1U << 1) - 1) & (word0 >> 28); + return v; + } + CONSTEXPR wd_status_r& set_stalled_by_ws_fc(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<28) & word0) | ((((1U << 1) - 1) & value) << 28); + return *this; + } + CONSTEXPR uint32_t get_stalled_by_ws_tc() const + { + auto v = ((1U << 1) - 1) & (word0 >> 29); + return v; + } + CONSTEXPR wd_status_r& set_stalled_by_ws_tc(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<29) & word0) | ((((1U << 1) - 1) & value) << 29); + return *this; + } + CONSTEXPR uint32_t get_stalled_by_wd_buf() const + { + auto v = ((1U << 1) - 1) & (word0 >> 30); + return v; + } + CONSTEXPR wd_status_r& set_stalled_by_wd_buf(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<30) & word0) | ((((1U << 1) - 1) & value) << 30); + return *this; + } +#endif +}; + + +struct mac_status_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t block_cfg_valid : 1; + uint32_t trav_en : 1; + uint32_t wait_for_ib : 1; + uint32_t wait_for_acc_buf : 1; + uint32_t wait_for_weights : 1; + uint32_t stall_stripe : 1; + uint32_t dw_sel : 1; + uint32_t wait_for_dw0_ready : 1; + uint32_t wait_for_dw1_ready : 1; + uint32_t acc_buf_sel_ai : 1; + uint32_t wait_for_acc0_ready : 1; + uint32_t wait_for_acc1_ready : 1; + uint32_t 
acc_buf_sel_aa : 1; + uint32_t acc0_valid : 1; + uint32_t acc1_valid : 1; + uint32_t reserved0 : 1; + uint32_t events : 11; + uint32_t reserved1 : 5; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR mac_status_r() : + word0(0) + {} + CONSTEXPR mac_status_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + mac_status_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_block_cfg_valid() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR mac_status_r& set_block_cfg_valid(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_trav_en() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + return v; + } + CONSTEXPR mac_status_r& set_trav_en(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & value) << 1); + return *this; + } + CONSTEXPR uint32_t get_wait_for_ib() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + CONSTEXPR mac_status_r& set_wait_for_ib(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) & value) << 2); + return *this; + } + CONSTEXPR uint32_t get_wait_for_acc_buf() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR mac_status_r& set_wait_for_acc_buf(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + return *this; + } + CONSTEXPR uint32_t get_wait_for_weights() const + { + auto v = ((1U << 1) - 1) & (word0 >> 4); + return v; + } + CONSTEXPR mac_status_r& set_wait_for_weights(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<4) & word0) | ((((1U << 1) - 1) & value) << 4); + return *this; + } + CONSTEXPR uint32_t get_stall_stripe() const + { + auto v = ((1U << 1) - 1) & (word0 >> 5); + return v; + } + CONSTEXPR mac_status_r& 
set_stall_stripe(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<5) & word0) | ((((1U << 1) - 1) & value) << 5); + return *this; + } + CONSTEXPR uint32_t get_dw_sel() const + { + auto v = ((1U << 1) - 1) & (word0 >> 6); + return v; + } + CONSTEXPR mac_status_r& set_dw_sel(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<6) & word0) | ((((1U << 1) - 1) & value) << 6); + return *this; + } + CONSTEXPR uint32_t get_wait_for_dw0_ready() const + { + auto v = ((1U << 1) - 1) & (word0 >> 7); + return v; + } + CONSTEXPR mac_status_r& set_wait_for_dw0_ready(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<7) & word0) | ((((1U << 1) - 1) & value) << 7); + return *this; + } + CONSTEXPR uint32_t get_wait_for_dw1_ready() const + { + auto v = ((1U << 1) - 1) & (word0 >> 8); + return v; + } + CONSTEXPR mac_status_r& set_wait_for_dw1_ready(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<8) & word0) | ((((1U << 1) - 1) & value) << 8); + return *this; + } + CONSTEXPR uint32_t get_acc_buf_sel_ai() const + { + auto v = ((1U << 1) - 1) & (word0 >> 9); + return v; + } + CONSTEXPR mac_status_r& set_acc_buf_sel_ai(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<9) & word0) | ((((1U << 1) - 1) & value) << 9); + return *this; + } + CONSTEXPR uint32_t get_wait_for_acc0_ready() const + { + auto v = ((1U << 1) - 1) & (word0 >> 10); + return v; + } + CONSTEXPR mac_status_r& set_wait_for_acc0_ready(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<10) & word0) | ((((1U << 1) - 1) & value) << 10); + return *this; + } + CONSTEXPR uint32_t get_wait_for_acc1_ready() const + { + auto v = ((1U << 1) - 1) & (word0 >> 11); + return v; + } + CONSTEXPR mac_status_r& set_wait_for_acc1_ready(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<11) & word0) | ((((1U << 1) - 1) & value) << 11); + return *this; + } + CONSTEXPR uint32_t get_acc_buf_sel_aa() const + { + auto v = ((1U << 1) - 1) & (word0 >> 12); + return v; + } + CONSTEXPR mac_status_r& set_acc_buf_sel_aa(uint32_t value) + { + word0 = (~(((1U 
<< 1) - 1)<<12) & word0) | ((((1U << 1) - 1) & value) << 12); + return *this; + } + CONSTEXPR uint32_t get_acc0_valid() const + { + auto v = ((1U << 1) - 1) & (word0 >> 13); + return v; + } + CONSTEXPR mac_status_r& set_acc0_valid(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<13) & word0) | ((((1U << 1) - 1) & value) << 13); + return *this; + } + CONSTEXPR uint32_t get_acc1_valid() const + { + auto v = ((1U << 1) - 1) & (word0 >> 14); + return v; + } + CONSTEXPR mac_status_r& set_acc1_valid(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<14) & word0) | ((((1U << 1) - 1) & value) << 14); + return *this; + } + CONSTEXPR uint32_t get_events() const + { + auto v = ((1U << 11) - 1) & (word0 >> 16); + return v; + } + CONSTEXPR mac_status_r& set_events(uint32_t value) + { + word0 = (~(((1U << 11) - 1)<<16) & word0) | ((((1U << 11) - 1) & value) << 16); + return *this; + } +#endif +}; + + +struct ao_status_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t ao_active : 1; + uint32_t reserved0 : 2; + uint32_t ao_stalled_by_bs_or_ob : 1; + uint32_t ao_stalled_by_bs : 1; + uint32_t ao_stalled_by_ob : 1; + uint32_t ao_stalled_by_ab_or_cb : 1; + uint32_t ao_stalled_by_ab : 1; + uint32_t ao_stalled_by_cb : 1; + uint32_t reserved1 : 23; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ao_status_r() : + word0(0) + {} + CONSTEXPR ao_status_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ao_status_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_ao_active() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR ao_status_r& set_ao_active(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_ao_stalled_by_bs_or_ob() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR 
ao_status_r& set_ao_stalled_by_bs_or_ob(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + return *this; + } + CONSTEXPR uint32_t get_ao_stalled_by_bs() const + { + auto v = ((1U << 1) - 1) & (word0 >> 4); + return v; + } + CONSTEXPR ao_status_r& set_ao_stalled_by_bs(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<4) & word0) | ((((1U << 1) - 1) & value) << 4); + return *this; + } + CONSTEXPR uint32_t get_ao_stalled_by_ob() const + { + auto v = ((1U << 1) - 1) & (word0 >> 5); + return v; + } + CONSTEXPR ao_status_r& set_ao_stalled_by_ob(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<5) & word0) | ((((1U << 1) - 1) & value) << 5); + return *this; + } + CONSTEXPR uint32_t get_ao_stalled_by_ab_or_cb() const + { + auto v = ((1U << 1) - 1) & (word0 >> 6); + return v; + } + CONSTEXPR ao_status_r& set_ao_stalled_by_ab_or_cb(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<6) & word0) | ((((1U << 1) - 1) & value) << 6); + return *this; + } + CONSTEXPR uint32_t get_ao_stalled_by_ab() const + { + auto v = ((1U << 1) - 1) & (word0 >> 7); + return v; + } + CONSTEXPR ao_status_r& set_ao_stalled_by_ab(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<7) & word0) | ((((1U << 1) - 1) & value) << 7); + return *this; + } + CONSTEXPR uint32_t get_ao_stalled_by_cb() const + { + auto v = ((1U << 1) - 1) & (word0 >> 8); + return v; + } + CONSTEXPR ao_status_r& set_ao_stalled_by_cb(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<8) & word0) | ((((1U << 1) - 1) & value) << 8); + return *this; + } +#endif +}; + + +struct dma_status0_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t cmd_ch_idle : 1; + uint32_t ifm0_ch_idle : 1; + uint32_t ifm1_ch_idle : 1; + uint32_t wgt_ch_idle : 1; + uint32_t bas_ch_idle : 1; + uint32_t m2m_ch_idle : 1; + uint32_t ofm_ch_idle : 1; + uint32_t axi_halt_req : 1; + uint32_t axi_halt_ack : 1; + uint32_t axi_pause_req : 1; + uint32_t axi_pause_ack : 1; + uint32_t cmd_abort_ack : 1; + uint32_t 
cmd_abort_req : 1; + uint32_t ifm_mac_if_stall : 1; + uint32_t ifm_tc_if_stall : 1; + uint32_t ifm_ao_if_stall : 1; + uint32_t ofm_if_stall : 1; + uint32_t cmd_if_stall : 1; + uint32_t wd_sc0_if_stall : 1; + uint32_t wd_sc1_if_stall : 1; + uint32_t wd_sc2_if_stall : 1; + uint32_t wd_sc3_if_stall : 1; + uint32_t wd_fc_if_stall : 1; + uint32_t bs_if_stall : 1; + uint32_t lutcfg_if_stall : 1; + uint32_t reserved0 : 7; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR dma_status0_r() : + word0(0) + {} + CONSTEXPR dma_status0_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + dma_status0_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_cmd_ch_idle() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR dma_status0_r& set_cmd_ch_idle(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_ifm0_ch_idle() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + return v; + } + CONSTEXPR dma_status0_r& set_ifm0_ch_idle(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & value) << 1); + return *this; + } + CONSTEXPR uint32_t get_ifm1_ch_idle() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + CONSTEXPR dma_status0_r& set_ifm1_ch_idle(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) & value) << 2); + return *this; + } + CONSTEXPR uint32_t get_wgt_ch_idle() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR dma_status0_r& set_wgt_ch_idle(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + return *this; + } + CONSTEXPR uint32_t get_bas_ch_idle() const + { + auto v = ((1U << 1) - 1) & (word0 >> 4); + return v; + } + CONSTEXPR dma_status0_r& 
set_bas_ch_idle(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<4) & word0) | ((((1U << 1) - 1) & value) << 4); + return *this; + } + CONSTEXPR uint32_t get_m2m_ch_idle() const + { + auto v = ((1U << 1) - 1) & (word0 >> 5); + return v; + } + CONSTEXPR dma_status0_r& set_m2m_ch_idle(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<5) & word0) | ((((1U << 1) - 1) & value) << 5); + return *this; + } + CONSTEXPR uint32_t get_ofm_ch_idle() const + { + auto v = ((1U << 1) - 1) & (word0 >> 6); + return v; + } + CONSTEXPR dma_status0_r& set_ofm_ch_idle(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<6) & word0) | ((((1U << 1) - 1) & value) << 6); + return *this; + } + CONSTEXPR uint32_t get_axi_halt_req() const + { + auto v = ((1U << 1) - 1) & (word0 >> 7); + return v; + } + CONSTEXPR dma_status0_r& set_axi_halt_req(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<7) & word0) | ((((1U << 1) - 1) & value) << 7); + return *this; + } + CONSTEXPR uint32_t get_axi_halt_ack() const + { + auto v = ((1U << 1) - 1) & (word0 >> 8); + return v; + } + CONSTEXPR dma_status0_r& set_axi_halt_ack(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<8) & word0) | ((((1U << 1) - 1) & value) << 8); + return *this; + } + CONSTEXPR uint32_t get_axi_pause_req() const + { + auto v = ((1U << 1) - 1) & (word0 >> 9); + return v; + } + CONSTEXPR dma_status0_r& set_axi_pause_req(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<9) & word0) | ((((1U << 1) - 1) & value) << 9); + return *this; + } + CONSTEXPR uint32_t get_axi_pause_ack() const + { + auto v = ((1U << 1) - 1) & (word0 >> 10); + return v; + } + CONSTEXPR dma_status0_r& set_axi_pause_ack(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<10) & word0) | ((((1U << 1) - 1) & value) << 10); + return *this; + } + CONSTEXPR uint32_t get_cmd_abort_ack() const + { + auto v = ((1U << 1) - 1) & (word0 >> 11); + return v; + } + CONSTEXPR dma_status0_r& set_cmd_abort_ack(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<11) & word0) | ((((1U << 1) - 
1) & value) << 11); + return *this; + } + CONSTEXPR uint32_t get_cmd_abort_req() const + { + auto v = ((1U << 1) - 1) & (word0 >> 12); + return v; + } + CONSTEXPR dma_status0_r& set_cmd_abort_req(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<12) & word0) | ((((1U << 1) - 1) & value) << 12); + return *this; + } + CONSTEXPR uint32_t get_ifm_mac_if_stall() const + { + auto v = ((1U << 1) - 1) & (word0 >> 13); + return v; + } + CONSTEXPR dma_status0_r& set_ifm_mac_if_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<13) & word0) | ((((1U << 1) - 1) & value) << 13); + return *this; + } + CONSTEXPR uint32_t get_ifm_tc_if_stall() const + { + auto v = ((1U << 1) - 1) & (word0 >> 14); + return v; + } + CONSTEXPR dma_status0_r& set_ifm_tc_if_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<14) & word0) | ((((1U << 1) - 1) & value) << 14); + return *this; + } + CONSTEXPR uint32_t get_ifm_ao_if_stall() const + { + auto v = ((1U << 1) - 1) & (word0 >> 15); + return v; + } + CONSTEXPR dma_status0_r& set_ifm_ao_if_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<15) & word0) | ((((1U << 1) - 1) & value) << 15); + return *this; + } + CONSTEXPR uint32_t get_ofm_if_stall() const + { + auto v = ((1U << 1) - 1) & (word0 >> 16); + return v; + } + CONSTEXPR dma_status0_r& set_ofm_if_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<16) & word0) | ((((1U << 1) - 1) & value) << 16); + return *this; + } + CONSTEXPR uint32_t get_cmd_if_stall() const + { + auto v = ((1U << 1) - 1) & (word0 >> 17); + return v; + } + CONSTEXPR dma_status0_r& set_cmd_if_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<17) & word0) | ((((1U << 1) - 1) & value) << 17); + return *this; + } + CONSTEXPR uint32_t get_wd_sc0_if_stall() const + { + auto v = ((1U << 1) - 1) & (word0 >> 18); + return v; + } + CONSTEXPR dma_status0_r& set_wd_sc0_if_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<18) & word0) | ((((1U << 1) - 1) & value) << 18); + return *this; + } + CONSTEXPR 
uint32_t get_wd_sc1_if_stall() const + { + auto v = ((1U << 1) - 1) & (word0 >> 19); + return v; + } + CONSTEXPR dma_status0_r& set_wd_sc1_if_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<19) & word0) | ((((1U << 1) - 1) & value) << 19); + return *this; + } + CONSTEXPR uint32_t get_wd_sc2_if_stall() const + { + auto v = ((1U << 1) - 1) & (word0 >> 20); + return v; + } + CONSTEXPR dma_status0_r& set_wd_sc2_if_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<20) & word0) | ((((1U << 1) - 1) & value) << 20); + return *this; + } + CONSTEXPR uint32_t get_wd_sc3_if_stall() const + { + auto v = ((1U << 1) - 1) & (word0 >> 21); + return v; + } + CONSTEXPR dma_status0_r& set_wd_sc3_if_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<21) & word0) | ((((1U << 1) - 1) & value) << 21); + return *this; + } + CONSTEXPR uint32_t get_wd_fc_if_stall() const + { + auto v = ((1U << 1) - 1) & (word0 >> 22); + return v; + } + CONSTEXPR dma_status0_r& set_wd_fc_if_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<22) & word0) | ((((1U << 1) - 1) & value) << 22); + return *this; + } + CONSTEXPR uint32_t get_bs_if_stall() const + { + auto v = ((1U << 1) - 1) & (word0 >> 23); + return v; + } + CONSTEXPR dma_status0_r& set_bs_if_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<23) & word0) | ((((1U << 1) - 1) & value) << 23); + return *this; + } + CONSTEXPR uint32_t get_lutcfg_if_stall() const + { + auto v = ((1U << 1) - 1) & (word0 >> 24); + return v; + } + CONSTEXPR dma_status0_r& set_lutcfg_if_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<24) & word0) | ((((1U << 1) - 1) & value) << 24); + return *this; + } +#endif +}; + + +struct dma_status1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t axi_sram0_ar_stalled : 1; + uint32_t axi_sram0_rd_limit_stall : 1; + uint32_t axi_sram0_aw_stalled : 1; + uint32_t axi_sram0_w_stalled : 1; + uint32_t axi_sram0_wr_limit_stall : 1; + uint32_t axi_sram1_ar_stalled : 1; + uint32_t 
axi_sram1_rd_limit_stall : 1; + uint32_t axi_sram1_aw_stalled : 1; + uint32_t axi_sram1_w_stalled : 1; + uint32_t axi_sram1_wr_limit_stall : 1; + uint32_t axi_sram2_ar_stalled : 1; + uint32_t axi_sram2_rd_limit_stall : 1; + uint32_t axi_sram2_aw_stalled : 1; + uint32_t axi_sram2_w_stalled : 1; + uint32_t axi_sram2_wr_limit_stall : 1; + uint32_t axi_sram3_ar_stalled : 1; + uint32_t axi_sram3_rd_limit_stall : 1; + uint32_t axi_sram3_aw_stalled : 1; + uint32_t axi_sram3_w_stalled : 1; + uint32_t axi_sram3_wr_limit_stall : 1; + uint32_t axi_ext0_ar_stalled : 1; + uint32_t axi_ext0_rd_limit_stall : 1; + uint32_t axi_ext0_aw_stalled : 1; + uint32_t axi_ext0_w_stalled : 1; + uint32_t axi_ext0_wr_limit_stall : 1; + uint32_t axi_ext1_ar_stalled : 1; + uint32_t axi_ext1_rd_limit_stall : 1; + uint32_t axi_ext1_aw_stalled : 1; + uint32_t axi_ext1_w_stalled : 1; + uint32_t axi_ext1_wr_limit_stall : 1; + uint32_t reserved0 : 2; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR dma_status1_r() : + word0(0) + {} + CONSTEXPR dma_status1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + dma_status1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_axi_sram0_ar_stalled() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR dma_status1_r& set_axi_sram0_ar_stalled(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_axi_sram0_rd_limit_stall() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + return v; + } + CONSTEXPR dma_status1_r& set_axi_sram0_rd_limit_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & value) << 1); + return *this; + } + CONSTEXPR uint32_t get_axi_sram0_aw_stalled() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + CONSTEXPR 
dma_status1_r& set_axi_sram0_aw_stalled(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) & value) << 2); + return *this; + } + CONSTEXPR uint32_t get_axi_sram0_w_stalled() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR dma_status1_r& set_axi_sram0_w_stalled(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + return *this; + } + CONSTEXPR uint32_t get_axi_sram0_wr_limit_stall() const + { + auto v = ((1U << 1) - 1) & (word0 >> 4); + return v; + } + CONSTEXPR dma_status1_r& set_axi_sram0_wr_limit_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<4) & word0) | ((((1U << 1) - 1) & value) << 4); + return *this; + } + CONSTEXPR uint32_t get_axi_sram1_ar_stalled() const + { + auto v = ((1U << 1) - 1) & (word0 >> 5); + return v; + } + CONSTEXPR dma_status1_r& set_axi_sram1_ar_stalled(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<5) & word0) | ((((1U << 1) - 1) & value) << 5); + return *this; + } + CONSTEXPR uint32_t get_axi_sram1_rd_limit_stall() const + { + auto v = ((1U << 1) - 1) & (word0 >> 6); + return v; + } + CONSTEXPR dma_status1_r& set_axi_sram1_rd_limit_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<6) & word0) | ((((1U << 1) - 1) & value) << 6); + return *this; + } + CONSTEXPR uint32_t get_axi_sram1_aw_stalled() const + { + auto v = ((1U << 1) - 1) & (word0 >> 7); + return v; + } + CONSTEXPR dma_status1_r& set_axi_sram1_aw_stalled(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<7) & word0) | ((((1U << 1) - 1) & value) << 7); + return *this; + } + CONSTEXPR uint32_t get_axi_sram1_w_stalled() const + { + auto v = ((1U << 1) - 1) & (word0 >> 8); + return v; + } + CONSTEXPR dma_status1_r& set_axi_sram1_w_stalled(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<8) & word0) | ((((1U << 1) - 1) & value) << 8); + return *this; + } + CONSTEXPR uint32_t get_axi_sram1_wr_limit_stall() const + { + auto v = ((1U << 1) - 1) & (word0 >> 9); + 
return v; + } + CONSTEXPR dma_status1_r& set_axi_sram1_wr_limit_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<9) & word0) | ((((1U << 1) - 1) & value) << 9); + return *this; + } + CONSTEXPR uint32_t get_axi_sram2_ar_stalled() const + { + auto v = ((1U << 1) - 1) & (word0 >> 10); + return v; + } + CONSTEXPR dma_status1_r& set_axi_sram2_ar_stalled(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<10) & word0) | ((((1U << 1) - 1) & value) << 10); + return *this; + } + CONSTEXPR uint32_t get_axi_sram2_rd_limit_stall() const + { + auto v = ((1U << 1) - 1) & (word0 >> 11); + return v; + } + CONSTEXPR dma_status1_r& set_axi_sram2_rd_limit_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<11) & word0) | ((((1U << 1) - 1) & value) << 11); + return *this; + } + CONSTEXPR uint32_t get_axi_sram2_aw_stalled() const + { + auto v = ((1U << 1) - 1) & (word0 >> 12); + return v; + } + CONSTEXPR dma_status1_r& set_axi_sram2_aw_stalled(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<12) & word0) | ((((1U << 1) - 1) & value) << 12); + return *this; + } + CONSTEXPR uint32_t get_axi_sram2_w_stalled() const + { + auto v = ((1U << 1) - 1) & (word0 >> 13); + return v; + } + CONSTEXPR dma_status1_r& set_axi_sram2_w_stalled(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<13) & word0) | ((((1U << 1) - 1) & value) << 13); + return *this; + } + CONSTEXPR uint32_t get_axi_sram2_wr_limit_stall() const + { + auto v = ((1U << 1) - 1) & (word0 >> 14); + return v; + } + CONSTEXPR dma_status1_r& set_axi_sram2_wr_limit_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<14) & word0) | ((((1U << 1) - 1) & value) << 14); + return *this; + } + CONSTEXPR uint32_t get_axi_sram3_ar_stalled() const + { + auto v = ((1U << 1) - 1) & (word0 >> 15); + return v; + } + CONSTEXPR dma_status1_r& set_axi_sram3_ar_stalled(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<15) & word0) | ((((1U << 1) - 1) & value) << 15); + return *this; + } + CONSTEXPR uint32_t get_axi_sram3_rd_limit_stall() const 
+ { + auto v = ((1U << 1) - 1) & (word0 >> 16); + return v; + } + CONSTEXPR dma_status1_r& set_axi_sram3_rd_limit_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<16) & word0) | ((((1U << 1) - 1) & value) << 16); + return *this; + } + CONSTEXPR uint32_t get_axi_sram3_aw_stalled() const + { + auto v = ((1U << 1) - 1) & (word0 >> 17); + return v; + } + CONSTEXPR dma_status1_r& set_axi_sram3_aw_stalled(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<17) & word0) | ((((1U << 1) - 1) & value) << 17); + return *this; + } + CONSTEXPR uint32_t get_axi_sram3_w_stalled() const + { + auto v = ((1U << 1) - 1) & (word0 >> 18); + return v; + } + CONSTEXPR dma_status1_r& set_axi_sram3_w_stalled(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<18) & word0) | ((((1U << 1) - 1) & value) << 18); + return *this; + } + CONSTEXPR uint32_t get_axi_sram3_wr_limit_stall() const + { + auto v = ((1U << 1) - 1) & (word0 >> 19); + return v; + } + CONSTEXPR dma_status1_r& set_axi_sram3_wr_limit_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<19) & word0) | ((((1U << 1) - 1) & value) << 19); + return *this; + } + CONSTEXPR uint32_t get_axi_ext0_ar_stalled() const + { + auto v = ((1U << 1) - 1) & (word0 >> 20); + return v; + } + CONSTEXPR dma_status1_r& set_axi_ext0_ar_stalled(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<20) & word0) | ((((1U << 1) - 1) & value) << 20); + return *this; + } + CONSTEXPR uint32_t get_axi_ext0_rd_limit_stall() const + { + auto v = ((1U << 1) - 1) & (word0 >> 21); + return v; + } + CONSTEXPR dma_status1_r& set_axi_ext0_rd_limit_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<21) & word0) | ((((1U << 1) - 1) & value) << 21); + return *this; + } + CONSTEXPR uint32_t get_axi_ext0_aw_stalled() const + { + auto v = ((1U << 1) - 1) & (word0 >> 22); + return v; + } + CONSTEXPR dma_status1_r& set_axi_ext0_aw_stalled(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<22) & word0) | ((((1U << 1) - 1) & value) << 22); + return *this; + } + CONSTEXPR 
uint32_t get_axi_ext0_w_stalled() const + { + auto v = ((1U << 1) - 1) & (word0 >> 23); + return v; + } + CONSTEXPR dma_status1_r& set_axi_ext0_w_stalled(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<23) & word0) | ((((1U << 1) - 1) & value) << 23); + return *this; + } + CONSTEXPR uint32_t get_axi_ext0_wr_limit_stall() const + { + auto v = ((1U << 1) - 1) & (word0 >> 24); + return v; + } + CONSTEXPR dma_status1_r& set_axi_ext0_wr_limit_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<24) & word0) | ((((1U << 1) - 1) & value) << 24); + return *this; + } + CONSTEXPR uint32_t get_axi_ext1_ar_stalled() const + { + auto v = ((1U << 1) - 1) & (word0 >> 25); + return v; + } + CONSTEXPR dma_status1_r& set_axi_ext1_ar_stalled(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<25) & word0) | ((((1U << 1) - 1) & value) << 25); + return *this; + } + CONSTEXPR uint32_t get_axi_ext1_rd_limit_stall() const + { + auto v = ((1U << 1) - 1) & (word0 >> 26); + return v; + } + CONSTEXPR dma_status1_r& set_axi_ext1_rd_limit_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<26) & word0) | ((((1U << 1) - 1) & value) << 26); + return *this; + } + CONSTEXPR uint32_t get_axi_ext1_aw_stalled() const + { + auto v = ((1U << 1) - 1) & (word0 >> 27); + return v; + } + CONSTEXPR dma_status1_r& set_axi_ext1_aw_stalled(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<27) & word0) | ((((1U << 1) - 1) & value) << 27); + return *this; + } + CONSTEXPR uint32_t get_axi_ext1_w_stalled() const + { + auto v = ((1U << 1) - 1) & (word0 >> 28); + return v; + } + CONSTEXPR dma_status1_r& set_axi_ext1_w_stalled(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<28) & word0) | ((((1U << 1) - 1) & value) << 28); + return *this; + } + CONSTEXPR uint32_t get_axi_ext1_wr_limit_stall() const + { + auto v = ((1U << 1) - 1) & (word0 >> 29); + return v; + } + CONSTEXPR dma_status1_r& set_axi_ext1_wr_limit_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<29) & word0) | ((((1U << 1) - 1) & value) << 
29); + return *this; + } +#endif +}; + + +struct pmcr_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t cnt_en : 1; + uint32_t event_cnt_rst : 1; + uint32_t cycle_cnt_rst : 1; + uint32_t mask_en : 1; + uint32_t reserved0 : 7; + uint32_t num_event_cnt : 5; + uint32_t reserved1 : 16; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pmcr_r() : + word0(16384) + {} + CONSTEXPR pmcr_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pmcr_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_cnt_en() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR pmcr_r& set_cnt_en(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_event_cnt_rst() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + return v; + } + CONSTEXPR pmcr_r& set_event_cnt_rst(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & value) << 1); + return *this; + } + CONSTEXPR uint32_t get_cycle_cnt_rst() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + CONSTEXPR pmcr_r& set_cycle_cnt_rst(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) & value) << 2); + return *this; + } + CONSTEXPR uint32_t get_mask_en() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR pmcr_r& set_mask_en(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + return *this; + } + CONSTEXPR uint32_t get_num_event_cnt() const + { + auto v = ((1U << 5) - 1) & (word0 >> 11); + return v; + } + CONSTEXPR pmcr_r& set_num_event_cnt(uint32_t value) + { + word0 = (~(((1U << 5) - 1)<<11) & word0) | ((((1U << 5) - 1) & value) << 11); + return *this; + } +#endif +}; + + +struct pmcntenset_r +{ +#ifndef __cplusplus + 
union + { + struct + { + uint32_t EVENT_CNT_0 : 1; + uint32_t EVENT_CNT_1 : 1; + uint32_t EVENT_CNT_2 : 1; + uint32_t EVENT_CNT_3 : 1; + uint32_t EVENT_CNT_4 : 1; + uint32_t EVENT_CNT_5 : 1; + uint32_t EVENT_CNT_6 : 1; + uint32_t EVENT_CNT_7 : 1; + uint32_t reserved0 : 23; + uint32_t CYCLE_CNT : 1; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pmcntenset_r() : + word0(0) + {} + CONSTEXPR pmcntenset_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pmcntenset_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR pmcntenset_r& set_EVENT_CNT_0(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_1() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + return v; + } + CONSTEXPR pmcntenset_r& set_EVENT_CNT_1(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & value) << 1); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_2() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + CONSTEXPR pmcntenset_r& set_EVENT_CNT_2(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) & value) << 2); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_3() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR pmcntenset_r& set_EVENT_CNT_3(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_4() const + { + auto v = ((1U << 1) - 1) & (word0 >> 4); + return v; + } + CONSTEXPR pmcntenset_r& set_EVENT_CNT_4(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<4) & word0) | ((((1U << 1) - 1) & value) << 4); + return *this; + } + CONSTEXPR 
uint32_t get_EVENT_CNT_5() const + { + auto v = ((1U << 1) - 1) & (word0 >> 5); + return v; + } + CONSTEXPR pmcntenset_r& set_EVENT_CNT_5(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<5) & word0) | ((((1U << 1) - 1) & value) << 5); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_6() const + { + auto v = ((1U << 1) - 1) & (word0 >> 6); + return v; + } + CONSTEXPR pmcntenset_r& set_EVENT_CNT_6(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<6) & word0) | ((((1U << 1) - 1) & value) << 6); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_7() const + { + auto v = ((1U << 1) - 1) & (word0 >> 7); + return v; + } + CONSTEXPR pmcntenset_r& set_EVENT_CNT_7(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<7) & word0) | ((((1U << 1) - 1) & value) << 7); + return *this; + } + CONSTEXPR uint32_t get_CYCLE_CNT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 31); + return v; + } + CONSTEXPR pmcntenset_r& set_CYCLE_CNT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<31) & word0) | ((((1U << 1) - 1) & value) << 31); + return *this; + } +#endif +}; + + +struct pmcntenclr_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t EVENT_CNT_0 : 1; + uint32_t EVENT_CNT_1 : 1; + uint32_t EVENT_CNT_2 : 1; + uint32_t EVENT_CNT_3 : 1; + uint32_t EVENT_CNT_4 : 1; + uint32_t EVENT_CNT_5 : 1; + uint32_t EVENT_CNT_6 : 1; + uint32_t EVENT_CNT_7 : 1; + uint32_t reserved0 : 23; + uint32_t CYCLE_CNT : 1; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pmcntenclr_r() : + word0(0) + {} + CONSTEXPR pmcntenclr_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pmcntenclr_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR pmcntenclr_r& set_EVENT_CNT_0(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 
1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_1() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + return v; + } + CONSTEXPR pmcntenclr_r& set_EVENT_CNT_1(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & value) << 1); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_2() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + CONSTEXPR pmcntenclr_r& set_EVENT_CNT_2(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) & value) << 2); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_3() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR pmcntenclr_r& set_EVENT_CNT_3(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_4() const + { + auto v = ((1U << 1) - 1) & (word0 >> 4); + return v; + } + CONSTEXPR pmcntenclr_r& set_EVENT_CNT_4(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<4) & word0) | ((((1U << 1) - 1) & value) << 4); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_5() const + { + auto v = ((1U << 1) - 1) & (word0 >> 5); + return v; + } + CONSTEXPR pmcntenclr_r& set_EVENT_CNT_5(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<5) & word0) | ((((1U << 1) - 1) & value) << 5); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_6() const + { + auto v = ((1U << 1) - 1) & (word0 >> 6); + return v; + } + CONSTEXPR pmcntenclr_r& set_EVENT_CNT_6(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<6) & word0) | ((((1U << 1) - 1) & value) << 6); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_7() const + { + auto v = ((1U << 1) - 1) & (word0 >> 7); + return v; + } + CONSTEXPR pmcntenclr_r& set_EVENT_CNT_7(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<7) & word0) | ((((1U << 1) - 1) & value) << 7); + return *this; + } + CONSTEXPR uint32_t get_CYCLE_CNT() const + { + auto v = ((1U << 1) - 1) & (word0 
>> 31); + return v; + } + CONSTEXPR pmcntenclr_r& set_CYCLE_CNT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<31) & word0) | ((((1U << 1) - 1) & value) << 31); + return *this; + } +#endif +}; + + +struct pmovsset_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t EVENT_CNT_0_OVF : 1; + uint32_t EVENT_CNT_1_OVF : 1; + uint32_t EVENT_CNT_2_OVF : 1; + uint32_t EVENT_CNT_3_OVF : 1; + uint32_t EVENT_CNT_4_OVF : 1; + uint32_t EVENT_CNT_5_OVF : 1; + uint32_t EVENT_CNT_6_OVF : 1; + uint32_t EVENT_CNT_7_OVF : 1; + uint32_t reserved0 : 23; + uint32_t CYCLE_CNT_OVF : 1; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pmovsset_r() : + word0(0) + {} + CONSTEXPR pmovsset_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pmovsset_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_0_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR pmovsset_r& set_EVENT_CNT_0_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_1_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + return v; + } + CONSTEXPR pmovsset_r& set_EVENT_CNT_1_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & value) << 1); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_2_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + CONSTEXPR pmovsset_r& set_EVENT_CNT_2_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) & value) << 2); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_3_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR pmovsset_r& set_EVENT_CNT_3_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + 
return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_4_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 4); + return v; + } + CONSTEXPR pmovsset_r& set_EVENT_CNT_4_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<4) & word0) | ((((1U << 1) - 1) & value) << 4); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_5_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 5); + return v; + } + CONSTEXPR pmovsset_r& set_EVENT_CNT_5_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<5) & word0) | ((((1U << 1) - 1) & value) << 5); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_6_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 6); + return v; + } + CONSTEXPR pmovsset_r& set_EVENT_CNT_6_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<6) & word0) | ((((1U << 1) - 1) & value) << 6); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_7_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 7); + return v; + } + CONSTEXPR pmovsset_r& set_EVENT_CNT_7_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<7) & word0) | ((((1U << 1) - 1) & value) << 7); + return *this; + } + CONSTEXPR uint32_t get_CYCLE_CNT_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 31); + return v; + } + CONSTEXPR pmovsset_r& set_CYCLE_CNT_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<31) & word0) | ((((1U << 1) - 1) & value) << 31); + return *this; + } +#endif +}; + + +struct pmovsclr_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t EVENT_CNT_0_OVF : 1; + uint32_t EVENT_CNT_1_OVF : 1; + uint32_t EVENT_CNT_2_OVF : 1; + uint32_t EVENT_CNT_3_OVF : 1; + uint32_t EVENT_CNT_4_OVF : 1; + uint32_t EVENT_CNT_5_OVF : 1; + uint32_t EVENT_CNT_6_OVF : 1; + uint32_t EVENT_CNT_7_OVF : 1; + uint32_t reserved0 : 23; + uint32_t CYCLE_CNT_OVF : 1; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pmovsclr_r() : + word0(0) + {} + CONSTEXPR pmovsclr_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) 
+ { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pmovsclr_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_0_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR pmovsclr_r& set_EVENT_CNT_0_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_1_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + return v; + } + CONSTEXPR pmovsclr_r& set_EVENT_CNT_1_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & value) << 1); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_2_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + CONSTEXPR pmovsclr_r& set_EVENT_CNT_2_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) & value) << 2); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_3_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR pmovsclr_r& set_EVENT_CNT_3_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_4_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 4); + return v; + } + CONSTEXPR pmovsclr_r& set_EVENT_CNT_4_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<4) & word0) | ((((1U << 1) - 1) & value) << 4); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_5_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 5); + return v; + } + CONSTEXPR pmovsclr_r& set_EVENT_CNT_5_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<5) & word0) | ((((1U << 1) - 1) & value) << 5); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_6_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 6); + return v; + } + CONSTEXPR pmovsclr_r& set_EVENT_CNT_6_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<6) & word0) | ((((1U << 
1) - 1) & value) << 6); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_7_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 7); + return v; + } + CONSTEXPR pmovsclr_r& set_EVENT_CNT_7_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<7) & word0) | ((((1U << 1) - 1) & value) << 7); + return *this; + } + CONSTEXPR uint32_t get_CYCLE_CNT_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 31); + return v; + } + CONSTEXPR pmovsclr_r& set_CYCLE_CNT_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<31) & word0) | ((((1U << 1) - 1) & value) << 31); + return *this; + } +#endif +}; + + +struct pmintset_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t EVENT_CNT_0_INT : 1; + uint32_t EVENT_CNT_1_INT : 1; + uint32_t EVENT_CNT_2_INT : 1; + uint32_t EVENT_CNT_3_INT : 1; + uint32_t EVENT_CNT_4_INT : 1; + uint32_t EVENT_CNT_5_INT : 1; + uint32_t EVENT_CNT_6_INT : 1; + uint32_t EVENT_CNT_7_INT : 1; + uint32_t reserved0 : 23; + uint32_t CYCLE_CNT_INT : 1; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pmintset_r() : + word0(0) + {} + CONSTEXPR pmintset_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pmintset_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_0_INT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR pmintset_r& set_EVENT_CNT_0_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_1_INT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + return v; + } + CONSTEXPR pmintset_r& set_EVENT_CNT_1_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & value) << 1); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_2_INT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + 
CONSTEXPR pmintset_r& set_EVENT_CNT_2_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) & value) << 2); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_3_INT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR pmintset_r& set_EVENT_CNT_3_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_4_INT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 4); + return v; + } + CONSTEXPR pmintset_r& set_EVENT_CNT_4_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<4) & word0) | ((((1U << 1) - 1) & value) << 4); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_5_INT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 5); + return v; + } + CONSTEXPR pmintset_r& set_EVENT_CNT_5_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<5) & word0) | ((((1U << 1) - 1) & value) << 5); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_6_INT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 6); + return v; + } + CONSTEXPR pmintset_r& set_EVENT_CNT_6_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<6) & word0) | ((((1U << 1) - 1) & value) << 6); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_7_INT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 7); + return v; + } + CONSTEXPR pmintset_r& set_EVENT_CNT_7_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<7) & word0) | ((((1U << 1) - 1) & value) << 7); + return *this; + } + CONSTEXPR uint32_t get_CYCLE_CNT_INT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 31); + return v; + } + CONSTEXPR pmintset_r& set_CYCLE_CNT_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<31) & word0) | ((((1U << 1) - 1) & value) << 31); + return *this; + } +#endif +}; + + +struct pmintclr_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t EVENT_CNT_0_INT : 1; + uint32_t EVENT_CNT_1_INT : 1; + uint32_t EVENT_CNT_2_INT : 1; + uint32_t 
EVENT_CNT_3_INT : 1; + uint32_t EVENT_CNT_4_INT : 1; + uint32_t EVENT_CNT_5_INT : 1; + uint32_t EVENT_CNT_6_INT : 1; + uint32_t EVENT_CNT_7_INT : 1; + uint32_t reserved0 : 23; + uint32_t CYCLE_CNT_INT : 1; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pmintclr_r() : + word0(0) + {} + CONSTEXPR pmintclr_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pmintclr_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_0_INT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR pmintclr_r& set_EVENT_CNT_0_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_1_INT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + return v; + } + CONSTEXPR pmintclr_r& set_EVENT_CNT_1_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & value) << 1); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_2_INT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + CONSTEXPR pmintclr_r& set_EVENT_CNT_2_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) & value) << 2); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_3_INT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR pmintclr_r& set_EVENT_CNT_3_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_4_INT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 4); + return v; + } + CONSTEXPR pmintclr_r& set_EVENT_CNT_4_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<4) & word0) | ((((1U << 1) - 1) & value) << 4); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_5_INT() const + { + auto v = ((1U << 1) - 1) & 
(word0 >> 5); + return v; + } + CONSTEXPR pmintclr_r& set_EVENT_CNT_5_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<5) & word0) | ((((1U << 1) - 1) & value) << 5); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_6_INT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 6); + return v; + } + CONSTEXPR pmintclr_r& set_EVENT_CNT_6_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<6) & word0) | ((((1U << 1) - 1) & value) << 6); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_7_INT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 7); + return v; + } + CONSTEXPR pmintclr_r& set_EVENT_CNT_7_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<7) & word0) | ((((1U << 1) - 1) & value) << 7); + return *this; + } + CONSTEXPR uint32_t get_CYCLE_CNT_INT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 31); + return v; + } + CONSTEXPR pmintclr_r& set_CYCLE_CNT_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<31) & word0) | ((((1U << 1) - 1) & value) << 31); + return *this; + } +#endif +}; + + +struct pmccntr_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t CYCLE_CNT_LO : 32; + uint32_t CYCLE_CNT_HI : 16; + uint32_t reserved0 : 16; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR pmccntr_r() : + word0(0), + word1(0) + {} + CONSTEXPR pmccntr_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + pmccntr_r copy() + { + return *this; + } +#endif +}; + + +struct pmccntr_cfg_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t CYCLE_CNT_CFG_START : 10; + uint32_t reserved0 : 
2; + uint32_t S0 : 1; + uint32_t S1 : 1; + uint32_t S2 : 1; + uint32_t S3 : 1; + uint32_t CYCLE_CNT_CFG_STOP : 10; + uint32_t reserved1 : 2; + uint32_t E0 : 1; + uint32_t E1 : 1; + uint32_t E2 : 1; + uint32_t E3 : 1; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pmccntr_cfg_r() : + word0(0) + {} + CONSTEXPR pmccntr_cfg_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pmccntr_cfg_r copy() + { + return *this; + } + CONSTEXPR NPU_NAMESPACE::pmu_event get_CYCLE_CNT_CFG_START() const + { + auto v = ((1U << 10) - 1) & (word0 >> 0); + assert(v <= 399); + return static_cast(v); + } + CONSTEXPR pmccntr_cfg_r& set_CYCLE_CNT_CFG_START(NPU_NAMESPACE::pmu_event value) + { + word0 = (~(((1U << 10) - 1)<<0) & word0) | ((((1U << 10) - 1) & static_cast(value)) << 0); + return *this; + } + CONSTEXPR NPU_NAMESPACE::pmu_port_disable get_S0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 12); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR pmccntr_cfg_r& set_S0(NPU_NAMESPACE::pmu_port_disable value) + { + word0 = (~(((1U << 1) - 1)<<12) & word0) | ((((1U << 1) - 1) & static_cast(value)) << 12); + return *this; + } + CONSTEXPR NPU_NAMESPACE::pmu_port_disable get_S1() const + { + auto v = ((1U << 1) - 1) & (word0 >> 13); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR pmccntr_cfg_r& set_S1(NPU_NAMESPACE::pmu_port_disable value) + { + word0 = (~(((1U << 1) - 1)<<13) & word0) | ((((1U << 1) - 1) & static_cast(value)) << 13); + return *this; + } + CONSTEXPR NPU_NAMESPACE::pmu_port_disable get_S2() const + { + auto v = ((1U << 1) - 1) & (word0 >> 14); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR pmccntr_cfg_r& set_S2(NPU_NAMESPACE::pmu_port_disable value) + { + word0 = (~(((1U << 1) - 1)<<14) & word0) | ((((1U << 1) - 1) & static_cast(value)) << 14); + return *this; + } + CONSTEXPR 
NPU_NAMESPACE::pmu_port_disable get_S3() const + { + auto v = ((1U << 1) - 1) & (word0 >> 15); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR pmccntr_cfg_r& set_S3(NPU_NAMESPACE::pmu_port_disable value) + { + word0 = (~(((1U << 1) - 1)<<15) & word0) | ((((1U << 1) - 1) & static_cast(value)) << 15); + return *this; + } + CONSTEXPR NPU_NAMESPACE::pmu_event get_CYCLE_CNT_CFG_STOP() const + { + auto v = ((1U << 10) - 1) & (word0 >> 16); + assert(v <= 399); + return static_cast(v); + } + CONSTEXPR pmccntr_cfg_r& set_CYCLE_CNT_CFG_STOP(NPU_NAMESPACE::pmu_event value) + { + word0 = (~(((1U << 10) - 1)<<16) & word0) | ((((1U << 10) - 1) & static_cast(value)) << 16); + return *this; + } + CONSTEXPR NPU_NAMESPACE::pmu_port_disable get_E0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 28); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR pmccntr_cfg_r& set_E0(NPU_NAMESPACE::pmu_port_disable value) + { + word0 = (~(((1U << 1) - 1)<<28) & word0) | ((((1U << 1) - 1) & static_cast(value)) << 28); + return *this; + } + CONSTEXPR NPU_NAMESPACE::pmu_port_disable get_E1() const + { + auto v = ((1U << 1) - 1) & (word0 >> 29); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR pmccntr_cfg_r& set_E1(NPU_NAMESPACE::pmu_port_disable value) + { + word0 = (~(((1U << 1) - 1)<<29) & word0) | ((((1U << 1) - 1) & static_cast(value)) << 29); + return *this; + } + CONSTEXPR NPU_NAMESPACE::pmu_port_disable get_E2() const + { + auto v = ((1U << 1) - 1) & (word0 >> 30); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR pmccntr_cfg_r& set_E2(NPU_NAMESPACE::pmu_port_disable value) + { + word0 = (~(((1U << 1) - 1)<<30) & word0) | ((((1U << 1) - 1) & static_cast(value)) << 30); + return *this; + } + CONSTEXPR NPU_NAMESPACE::pmu_port_disable get_E3() const + { + auto v = ((1U << 1) - 1) & (word0 >> 31); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR pmccntr_cfg_r& set_E3(NPU_NAMESPACE::pmu_port_disable value) + { + word0 = (~(((1U << 1) - 1)<<31) & 
word0) | ((((1U << 1) - 1) & static_cast(value)) << 31); + return *this; + } +#endif +}; + + +struct pmcaxi_chan_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t CH_SEL : 4; + uint32_t reserved0 : 4; + uint32_t AXI_SEL : 1; + uint32_t reserved1 : 1; + uint32_t BW_CH_SEL_EN : 1; + uint32_t reserved2 : 21; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pmcaxi_chan_r() : + word0(0) + {} + CONSTEXPR pmcaxi_chan_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pmcaxi_chan_r copy() + { + return *this; + } + CONSTEXPR NPU_NAMESPACE::pmu_axi_channel get_CH_SEL() const + { + auto v = ((1U << 4) - 1) & (word0 >> 0); + assert(v <= 9); + return static_cast(v); + } + CONSTEXPR pmcaxi_chan_r& set_CH_SEL(NPU_NAMESPACE::pmu_axi_channel value) + { + word0 = (~(((1U << 4) - 1)<<0) & word0) | ((((1U << 4) - 1) & static_cast(value)) << 0); + return *this; + } + CONSTEXPR NPU_NAMESPACE::axi_port get_AXI_SEL() const + { + auto v = ((1U << 1) - 1) & (word0 >> 8); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR pmcaxi_chan_r& set_AXI_SEL(NPU_NAMESPACE::axi_port value) + { + word0 = (~(((1U << 1) - 1)<<8) & word0) | ((((1U << 1) - 1) & static_cast(value)) << 8); + return *this; + } + CONSTEXPR uint32_t get_BW_CH_SEL_EN() const + { + auto v = ((1U << 1) - 1) & (word0 >> 10); + return v; + } + CONSTEXPR pmcaxi_chan_r& set_BW_CH_SEL_EN(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<10) & word0) | ((((1U << 1) - 1) & value) << 10); + return *this; + } +#endif +}; + + +struct pmclut_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t PCM_LUT_EN_0 : 1; + uint32_t reserved0 : 15; + uint32_t PMC_LUT_0 : 16; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pmclut_r() : + word0(0) + {} + CONSTEXPR pmclut_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t 
value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pmclut_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_PCM_LUT_EN_0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR pmclut_r& set_PCM_LUT_EN_0(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_PMC_LUT_0() const + { + auto v = ((1U << 16) - 1) & (word0 >> 16); + return v; + } + CONSTEXPR pmclut_r& set_PMC_LUT_0(uint32_t value) + { + word0 = (~(((1U << 16) - 1)<<16) & word0) | ((((1U << 16) - 1) & value) << 16); + return *this; + } +#endif +}; + + +struct pmevcntr_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t count : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pmevcntr_r() : + word0(0) + {} + CONSTEXPR pmevcntr_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pmevcntr_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_count() const + { + auto v = word0; + return v; + } + CONSTEXPR pmevcntr_r& set_count(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct pmevtyper_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t EV_TYPE : 10; + uint32_t reserved0 : 2; + uint32_t D0 : 1; + uint32_t D1 : 1; + uint32_t D2 : 1; + uint32_t D3 : 1; + uint32_t reserved1 : 16; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pmevtyper_r() : + word0(0) + {} + CONSTEXPR pmevtyper_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pmevtyper_r copy() + { + return *this; + } + CONSTEXPR NPU_NAMESPACE::pmu_event get_EV_TYPE() const + { + auto v = ((1U << 10) - 1) & (word0 >> 0); + assert(v <= 399); + return 
static_cast(v); + } + CONSTEXPR pmevtyper_r& set_EV_TYPE(NPU_NAMESPACE::pmu_event value) + { + word0 = (~(((1U << 10) - 1)<<0) & word0) | ((((1U << 10) - 1) & static_cast(value)) << 0); + return *this; + } + CONSTEXPR NPU_NAMESPACE::pmu_port_disable get_D0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 12); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR pmevtyper_r& set_D0(NPU_NAMESPACE::pmu_port_disable value) + { + word0 = (~(((1U << 1) - 1)<<12) & word0) | ((((1U << 1) - 1) & static_cast(value)) << 12); + return *this; + } + CONSTEXPR NPU_NAMESPACE::pmu_port_disable get_D1() const + { + auto v = ((1U << 1) - 1) & (word0 >> 13); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR pmevtyper_r& set_D1(NPU_NAMESPACE::pmu_port_disable value) + { + word0 = (~(((1U << 1) - 1)<<13) & word0) | ((((1U << 1) - 1) & static_cast(value)) << 13); + return *this; + } + CONSTEXPR NPU_NAMESPACE::pmu_port_disable get_D2() const + { + auto v = ((1U << 1) - 1) & (word0 >> 14); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR pmevtyper_r& set_D2(NPU_NAMESPACE::pmu_port_disable value) + { + word0 = (~(((1U << 1) - 1)<<14) & word0) | ((((1U << 1) - 1) & static_cast(value)) << 14); + return *this; + } + CONSTEXPR NPU_NAMESPACE::pmu_port_disable get_D3() const + { + auto v = ((1U << 1) - 1) & (word0 >> 15); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR pmevtyper_r& set_D3(NPU_NAMESPACE::pmu_port_disable value) + { + word0 = (~(((1U << 1) - 1)<<15) & word0) | ((((1U << 1) - 1) & static_cast(value)) << 15); + return *this; + } +#endif +}; + +struct NPU_REG +{ + STRUCT id_r ID; + STRUCT status_r STATUS; + STRUCT cmd_r CMD; + STRUCT reset_r RESET; + STRUCT qbase_r QBASE; + STRUCT qread_r QREAD; + STRUCT qconfig_r QCONFIG; + STRUCT qsize_r QSIZE; + STRUCT prot_r PROT; + STRUCT config_r CONFIG; + uint32_t unused0[1]; + STRUCT cond_status_r COND_STATUS; + uint32_t unused1[1]; + STRUCT power_ctrl_r POWER_CTRL; + STRUCT regioncfg_r REGIONCFG; + STRUCT 
mem_attr_r MEM_ATTR[4]; + STRUCT axi_sram_r AXI_SRAM; + STRUCT axi_ext_r AXI_EXT; + uint32_t unused2[2]; + STRUCT cfg_sram_cap_r CFG_SRAM_CAP; + STRUCT cfg_ext_cap_r CFG_EXT_CAP; + STRUCT cfg_sram_hash0_r CFG_SRAM_HASH0; + STRUCT cfg_sram_hash1_r CFG_SRAM_HASH1; + STRUCT cfg_ext_hash0_r CFG_EXT_HASH0; + STRUCT basep_r BASEP[8]; + uint32_t unused3[32]; + STRUCT clkforce_r CLKFORCE; + STRUCT debug_address_r DEBUG_ADDRESS; + STRUCT debug_misc_r DEBUG_MISC; + uint32_t unused4[61]; + STRUCT dma_ifm_src_r DMA_IFM_SRC; + STRUCT dma_ifm_dst_r DMA_IFM_DST; + STRUCT dma_ofm_src_r DMA_OFM_SRC; + STRUCT dma_ofm_dst_r DMA_OFM_DST; + STRUCT dma_weight_src_r DMA_WEIGHT_SRC; + STRUCT dma_cmd_src_r DMA_CMD_SRC; + STRUCT dma_cmd_size_r DMA_CMD_SIZE; + STRUCT dma_m2m_src_r DMA_M2M_SRC; + STRUCT dma_m2m_dst_r DMA_M2M_DST; + STRUCT current_qread_r CURRENT_QREAD; + STRUCT dma_scale_src_r DMA_SCALE_SRC; + STRUCT dma_weight1_src_r DMA_WEIGHT1_SRC; + STRUCT dma_weight2_src_r DMA_WEIGHT2_SRC; + STRUCT dma_weight3_src_r DMA_WEIGHT3_SRC; + uint32_t unused5[6]; + STRUCT current_op_r CURRENT_OP; + STRUCT current_cmd_r CURRENT_CMD; + uint32_t unused6[80]; + STRUCT internal_memory_r INTERNAL_MEMORY[256]; + STRUCT ifm_pad_top_r IFM_PAD_TOP; + STRUCT ifm_pad_left_r IFM_PAD_LEFT; + STRUCT ifm_pad_right_r IFM_PAD_RIGHT; + STRUCT ifm_pad_bottom_r IFM_PAD_BOTTOM; + STRUCT ifm_depth_m1_r IFM_DEPTH_M1; + STRUCT ifm_precision_r IFM_PRECISION; + uint32_t unused7[1]; + STRUCT ifm_upscale_r IFM_UPSCALE; + STRUCT ifm_broadcast_r IFM_BROADCAST; + STRUCT ifm_zero_point_r IFM_ZERO_POINT; + STRUCT ifm_width0_m1_r IFM_WIDTH0_M1; + STRUCT ifm_height0_m1_r IFM_HEIGHT0_M1; + STRUCT ifm_height1_m1_r IFM_HEIGHT1_M1; + uint32_t unused8[2]; + STRUCT ifm_region_r IFM_REGION; + uint32_t unused9[1]; + STRUCT ofm_width_m1_r OFM_WIDTH_M1; + STRUCT ofm_height_m1_r OFM_HEIGHT_M1; + STRUCT ofm_depth_m1_r OFM_DEPTH_M1; + STRUCT ofm_precision_r OFM_PRECISION; + STRUCT ofm_blk_width_m1_r OFM_BLK_WIDTH_M1; + STRUCT 
ofm_blk_height_m1_r OFM_BLK_HEIGHT_M1; + STRUCT ofm_blk_depth_m1_r OFM_BLK_DEPTH_M1; + STRUCT ofm_zero_point_r OFM_ZERO_POINT; + uint32_t unused10[1]; + STRUCT ofm_width0_m1_r OFM_WIDTH0_M1; + STRUCT ofm_height0_m1_r OFM_HEIGHT0_M1; + STRUCT ofm_height1_m1_r OFM_HEIGHT1_M1; + uint32_t unused11[2]; + STRUCT ofm_region_r OFM_REGION; + STRUCT kernel_width_m1_r KERNEL_WIDTH_M1; + STRUCT kernel_height_m1_r KERNEL_HEIGHT_M1; + STRUCT kernel_stride_r KERNEL_STRIDE; + uint32_t unused12[1]; + STRUCT acc_format_r ACC_FORMAT; + STRUCT activation_r ACTIVATION; + STRUCT activation_min_r ACTIVATION_MIN; + STRUCT activation_max_r ACTIVATION_MAX; + STRUCT weight_region_r WEIGHT_REGION; + STRUCT scale_region_r SCALE_REGION; + STRUCT resize_x_scale_n_m1_r RESIZE_X_SCALE_N_M1; + STRUCT resize_y_scale_n_m1_r RESIZE_Y_SCALE_N_M1; + STRUCT resize_x_offset_r RESIZE_X_OFFSET; + STRUCT resize_y_offset_r RESIZE_Y_OFFSET; + STRUCT weight_format_r WEIGHT_FORMAT; + STRUCT blockdep_r BLOCKDEP; + STRUCT dma0_src_region_r DMA0_SRC_REGION; + STRUCT dma0_dst_region_r DMA0_DST_REGION; + STRUCT dma0_size0_r DMA0_SIZE0; + STRUCT dma0_size1_r DMA0_SIZE1; + STRUCT dma0_idx_region_r DMA0_IDX_REGION; + uint32_t unused13[11]; + STRUCT ifm2_broadcast_r IFM2_BROADCAST; + uint32_t unused14[4]; + STRUCT ifm2_precision_r IFM2_PRECISION; + uint32_t unused15[3]; + STRUCT ifm2_zero_point_r IFM2_ZERO_POINT; + STRUCT ifm2_width0_m1_r IFM2_WIDTH0_M1; + STRUCT ifm2_height0_m1_r IFM2_HEIGHT0_M1; + STRUCT ifm2_height1_m1_r IFM2_HEIGHT1_M1; + uint32_t unused16[2]; + STRUCT ifm2_region_r IFM2_REGION; + uint32_t unused17[48]; + STRUCT ifm_base0_r IFM_BASE0; + STRUCT ifm_base1_r IFM_BASE1; + STRUCT ifm_base2_r IFM_BASE2; + STRUCT ifm_base3_r IFM_BASE3; + STRUCT ifm_stride_x_r IFM_STRIDE_X; + STRUCT ifm_stride_y_r IFM_STRIDE_Y; + STRUCT ifm_stride_c_r IFM_STRIDE_C; + uint32_t unused18[2]; + STRUCT ofm_base0_r OFM_BASE0; + STRUCT ofm_base1_r OFM_BASE1; + STRUCT ofm_base2_r OFM_BASE2; + STRUCT ofm_base3_r OFM_BASE3; + STRUCT 
ofm_stride_x_r OFM_STRIDE_X; + STRUCT ofm_stride_y_r OFM_STRIDE_Y; + STRUCT ofm_stride_c_r OFM_STRIDE_C; + uint32_t unused19[2]; + STRUCT weight_base_r WEIGHT_BASE; + STRUCT weight_length_r WEIGHT_LENGTH; + STRUCT scale_base_r SCALE_BASE; + STRUCT scale_length_r SCALE_LENGTH; + STRUCT ofm_scale_r OFM_SCALE; + STRUCT ifm_scale_r IFM_SCALE; + STRUCT ifm2_scale_r IFM2_SCALE; + STRUCT op_scalar_r OP_SCALAR; + STRUCT dma0_src_r DMA0_SRC; + STRUCT dma0_dst_r DMA0_DST; + STRUCT dma0_len_r DMA0_LEN; + STRUCT dma0_src_stride0_r DMA0_SRC_STRIDE0; + STRUCT dma0_src_stride1_r DMA0_SRC_STRIDE1; + STRUCT dma0_dst_stride0_r DMA0_DST_STRIDE0; + STRUCT dma0_dst_stride1_r DMA0_DST_STRIDE1; + STRUCT dma0_idx_r DMA0_IDX; + STRUCT ifm2_base0_r IFM2_BASE0; + STRUCT ifm2_base1_r IFM2_BASE1; + STRUCT ifm2_base2_r IFM2_BASE2; + STRUCT ifm2_base3_r IFM2_BASE3; + STRUCT ifm2_stride_x_r IFM2_STRIDE_X; + STRUCT ifm2_stride_y_r IFM2_STRIDE_Y; + STRUCT ifm2_stride_c_r IFM2_STRIDE_C; + uint32_t unused20[2]; + STRUCT weight1_base_r WEIGHT1_BASE; + STRUCT weight1_length_r WEIGHT1_LENGTH; + STRUCT weight2_base_r WEIGHT2_BASE; + STRUCT weight2_length_r WEIGHT2_LENGTH; + STRUCT weight3_base_r WEIGHT3_BASE; + STRUCT weight3_length_r WEIGHT3_LENGTH; + STRUCT resize_x_step_r RESIZE_X_STEP; + STRUCT resize_y_step_r RESIZE_Y_STEP; + uint32_t unused21[16]; + STRUCT dma0_idx_max_r DMA0_IDX_MAX; + STRUCT dma0_idx_skip1_r DMA0_IDX_SKIP1; + uint32_t unused22[252]; + STRUCT revision_r REVISION; + uint32_t unused23[3]; + STRUCT pid4_r PID4; + STRUCT pid5_r PID5; + STRUCT pid6_r PID6; + STRUCT pid7_r PID7; + STRUCT pid0_r PID0; + STRUCT pid1_r PID1; + STRUCT pid2_r PID2; + STRUCT pid3_r PID3; + STRUCT cid0_r CID0; + STRUCT cid1_r CID1; + STRUCT cid2_r CID2; + STRUCT cid3_r CID3; + uint32_t unused24[64]; + STRUCT wd_status_r WD_STATUS; + STRUCT mac_status_r MAC_STATUS; + STRUCT ao_status_r AO_STATUS; + uint32_t unused25[1]; + STRUCT dma_status0_r DMA_STATUS0; + STRUCT dma_status1_r DMA_STATUS1; + uint32_t 
unused26[26]; + STRUCT pmcr_r PMCR; + STRUCT pmcntenset_r PMCNTENSET; + STRUCT pmcntenclr_r PMCNTENCLR; + STRUCT pmovsset_r PMOVSSET; + STRUCT pmovsclr_r PMOVSCLR; + STRUCT pmintset_r PMINTSET; + STRUCT pmintclr_r PMINTCLR; + uint32_t unused27[1]; + STRUCT pmccntr_r PMCCNTR; + STRUCT pmccntr_cfg_r PMCCNTR_CFG; + STRUCT pmcaxi_chan_r PMCAXI_CHAN; + STRUCT pmclut_r PMCLUT; + uint32_t unused28[83]; + STRUCT pmevcntr_r PMEVCNTR[8]; + uint32_t unused29[24]; + STRUCT pmevtyper_r PMEVTYPER[8]; + +#ifdef __cplusplus + enum class access_type_t : uint8_t { RW, RO, WO }; + NPU_REG() + { + reset(); + } + void reset() + { + ID = 536899584; + STATUS = 8; + CMD = 12; + RESET = 0; + QBASE = 0; + QREAD = 0; + QCONFIG = 0; + QSIZE = 0; + PROT = 0; + CONFIG = 536870928; + COND_STATUS = 0; + POWER_CTRL = 0; + REGIONCFG = 0; + for (size_t i = 0; i < (sizeof(MEM_ATTR) / sizeof(MEM_ATTR[0])); ++i) + MEM_ATTR[i] = 0; + AXI_SRAM = 0; + AXI_EXT = 0; + CFG_SRAM_CAP = 0; + CFG_EXT_CAP = 0; + CFG_SRAM_HASH0 = 0; + CFG_SRAM_HASH1 = 0; + CFG_EXT_HASH0 = 0; + for (size_t i = 0; i < (sizeof(BASEP) / sizeof(BASEP[0])); ++i) + BASEP[i] = 0; + CLKFORCE = 0; + DEBUG_ADDRESS = 0; + DEBUG_MISC = 0; + DMA_IFM_SRC = 0; + DMA_IFM_DST = 0; + DMA_OFM_SRC = 0; + DMA_OFM_DST = 0; + DMA_WEIGHT_SRC = 0; + DMA_CMD_SRC = 0; + DMA_CMD_SIZE = 0; + DMA_M2M_SRC = 0; + DMA_M2M_DST = 0; + CURRENT_QREAD = 0; + DMA_SCALE_SRC = 0; + DMA_WEIGHT1_SRC = 0; + DMA_WEIGHT2_SRC = 0; + DMA_WEIGHT3_SRC = 0; + CURRENT_OP = 0; + CURRENT_CMD = 0; + for (size_t i = 0; i < (sizeof(INTERNAL_MEMORY) / sizeof(INTERNAL_MEMORY[0])); ++i) + INTERNAL_MEMORY[i] = 0; + IFM_PAD_TOP = 0; + IFM_PAD_LEFT = 0; + IFM_PAD_RIGHT = 0; + IFM_PAD_BOTTOM = 0; + IFM_DEPTH_M1 = 0; + IFM_PRECISION = 0; + IFM_UPSCALE = 0; + IFM_BROADCAST = 0; + IFM_ZERO_POINT = 0; + IFM_WIDTH0_M1 = 0; + IFM_HEIGHT0_M1 = 0; + IFM_HEIGHT1_M1 = 0; + IFM_REGION = 0; + OFM_WIDTH_M1 = 0; + OFM_HEIGHT_M1 = 0; + OFM_DEPTH_M1 = 0; + OFM_PRECISION = 0; + OFM_BLK_WIDTH_M1 = 0; + 
OFM_BLK_HEIGHT_M1 = 0; + OFM_BLK_DEPTH_M1 = 0; + OFM_ZERO_POINT = 0; + OFM_WIDTH0_M1 = 0; + OFM_HEIGHT0_M1 = 0; + OFM_HEIGHT1_M1 = 0; + OFM_REGION = 0; + KERNEL_WIDTH_M1 = 0; + KERNEL_HEIGHT_M1 = 0; + KERNEL_STRIDE = 0; + ACC_FORMAT = 0; + ACTIVATION = 0; + ACTIVATION_MIN = 0; + ACTIVATION_MAX = 0; + WEIGHT_REGION = 0; + SCALE_REGION = 0; + RESIZE_X_SCALE_N_M1 = 0; + RESIZE_Y_SCALE_N_M1 = 0; + RESIZE_X_OFFSET = 0; + RESIZE_Y_OFFSET = 0; + WEIGHT_FORMAT = 0; + BLOCKDEP = 0; + DMA0_SRC_REGION = 0; + DMA0_DST_REGION = 0; + DMA0_SIZE0 = 0; + DMA0_SIZE1 = 0; + DMA0_IDX_REGION = 0; + IFM2_BROADCAST = 0; + IFM2_PRECISION = 0; + IFM2_ZERO_POINT = 0; + IFM2_WIDTH0_M1 = 0; + IFM2_HEIGHT0_M1 = 0; + IFM2_HEIGHT1_M1 = 0; + IFM2_REGION = 0; + IFM_BASE0 = 0; + IFM_BASE1 = 0; + IFM_BASE2 = 0; + IFM_BASE3 = 0; + IFM_STRIDE_X = 0; + IFM_STRIDE_Y = 0; + IFM_STRIDE_C = 0; + OFM_BASE0 = 0; + OFM_BASE1 = 0; + OFM_BASE2 = 0; + OFM_BASE3 = 0; + OFM_STRIDE_X = 0; + OFM_STRIDE_Y = 0; + OFM_STRIDE_C = 0; + WEIGHT_BASE = 0; + WEIGHT_LENGTH = 0; + SCALE_BASE = 0; + SCALE_LENGTH = 0; + OFM_SCALE = 0; + IFM_SCALE = 0; + IFM2_SCALE = 0; + OP_SCALAR = 0; + DMA0_SRC = 0; + DMA0_DST = 0; + DMA0_LEN = 0; + DMA0_SRC_STRIDE0 = 0; + DMA0_SRC_STRIDE1 = 0; + DMA0_DST_STRIDE0 = 0; + DMA0_DST_STRIDE1 = 0; + DMA0_IDX = 0; + IFM2_BASE0 = 0; + IFM2_BASE1 = 0; + IFM2_BASE2 = 0; + IFM2_BASE3 = 0; + IFM2_STRIDE_X = 0; + IFM2_STRIDE_Y = 0; + IFM2_STRIDE_C = 0; + WEIGHT1_BASE = 0; + WEIGHT1_LENGTH = 0; + WEIGHT2_BASE = 0; + WEIGHT2_LENGTH = 0; + WEIGHT3_BASE = 0; + WEIGHT3_LENGTH = 0; + RESIZE_X_STEP = 0; + RESIZE_Y_STEP = 0; + DMA0_IDX_MAX = 0; + DMA0_IDX_SKIP1 = 0; + REVISION = 0; + PID4 = 4; + PID5 = 0; + PID6 = 0; + PID7 = 0; + PID0 = 130; + PID1 = 181; + PID2 = 11; + PID3 = 0; + CID0 = 13; + CID1 = 240; + CID2 = 5; + CID3 = 177; + WD_STATUS = 0; + MAC_STATUS = 0; + AO_STATUS = 0; + DMA_STATUS0 = 0; + DMA_STATUS1 = 0; + PMCR = 16384; + PMCNTENSET = 0; + PMCNTENCLR = 0; + PMOVSSET = 0; + PMOVSCLR = 0; + PMINTSET 
= 0; + PMINTCLR = 0; + PMCCNTR = 0; + PMCCNTR_CFG = 0; + PMCAXI_CHAN = 0; + PMCLUT = 0; + for (size_t i = 0; i < (sizeof(PMEVCNTR) / sizeof(PMEVCNTR[0])); ++i) + PMEVCNTR[i] = 0; + for (size_t i = 0; i < (sizeof(PMEVTYPER) / sizeof(PMEVTYPER[0])); ++i) + PMEVTYPER[i] = 0; + } + uint32_t& operator[](const int addr_offset) + { + return reinterpret_cast(this)[addr_offset / 4]; + } + access_type_t get_access_type(uint32_t offset) + { + switch (offset) + { + case 0: return access_type_t::RO; + case 4: return access_type_t::RO; + case 8: return access_type_t::RW; + case 12: return access_type_t::RW; + case 16: return access_type_t::RW; + case 24: return access_type_t::RO; + case 28: return access_type_t::RW; + case 32: return access_type_t::RW; + case 36: return access_type_t::RO; + case 40: return access_type_t::RO; + case 48: return access_type_t::RW; + case 56: return access_type_t::RW; + case 60: return access_type_t::RW; + case 64: return access_type_t::RW; + case 68: return access_type_t::RW; + case 72: return access_type_t::RW; + case 76: return access_type_t::RW; + case 80: return access_type_t::RW; + case 84: return access_type_t::RW; + case 96: return access_type_t::RO; + case 100: return access_type_t::RO; + case 104: return access_type_t::RO; + case 112: return access_type_t::RO; + case 120: return access_type_t::RO; + case 128: return access_type_t::RW; + case 136: return access_type_t::RW; + case 144: return access_type_t::RW; + case 152: return access_type_t::RW; + case 160: return access_type_t::RW; + case 168: return access_type_t::RW; + case 176: return access_type_t::RW; + case 184: return access_type_t::RW; + case 320: return access_type_t::RW; + case 324: return access_type_t::RW; + case 328: return access_type_t::RW; + case 576: return access_type_t::RO; + case 584: return access_type_t::RO; + case 588: return access_type_t::RO; + case 592: return access_type_t::RO; + case 600: return access_type_t::RO; + case 608: return access_type_t::RO; + case 
616: return access_type_t::RO; + case 620: return access_type_t::RO; + case 628: return access_type_t::RO; + case 636: return access_type_t::RO; + case 640: return access_type_t::RO; + case 648: return access_type_t::RO; + case 656: return access_type_t::RO; + case 664: return access_type_t::RO; + case 696: return access_type_t::RO; + case 700: return access_type_t::RO; + case 1024: return access_type_t::RW; + case 1028: return access_type_t::RW; + case 1032: return access_type_t::RW; + case 1036: return access_type_t::RW; + case 1040: return access_type_t::RW; + case 1044: return access_type_t::RW; + case 1048: return access_type_t::RW; + case 1052: return access_type_t::RW; + case 1056: return access_type_t::RW; + case 1060: return access_type_t::RW; + case 1064: return access_type_t::RW; + case 1068: return access_type_t::RW; + case 1072: return access_type_t::RW; + case 1076: return access_type_t::RW; + case 1080: return access_type_t::RW; + case 1084: return access_type_t::RW; + case 1088: return access_type_t::RW; + case 1092: return access_type_t::RW; + case 1096: return access_type_t::RW; + case 1100: return access_type_t::RW; + case 1104: return access_type_t::RW; + case 1108: return access_type_t::RW; + case 1112: return access_type_t::RW; + case 1116: return access_type_t::RW; + case 1120: return access_type_t::RW; + case 1124: return access_type_t::RW; + case 1128: return access_type_t::RW; + case 1132: return access_type_t::RW; + case 1136: return access_type_t::RW; + case 1140: return access_type_t::RW; + case 1144: return access_type_t::RW; + case 1148: return access_type_t::RW; + case 1152: return access_type_t::RW; + case 1156: return access_type_t::RW; + case 1160: return access_type_t::RW; + case 1164: return access_type_t::RW; + case 1168: return access_type_t::RW; + case 1172: return access_type_t::RW; + case 1176: return access_type_t::RW; + case 1180: return access_type_t::RW; + case 1184: return access_type_t::RW; + case 1188: return 
access_type_t::RW; + case 1192: return access_type_t::RW; + case 1196: return access_type_t::RW; + case 1200: return access_type_t::RW; + case 1204: return access_type_t::RW; + case 1208: return access_type_t::RW; + case 1212: return access_type_t::RW; + case 1216: return access_type_t::RW; + case 1220: return access_type_t::RW; + case 1224: return access_type_t::RW; + case 1228: return access_type_t::RW; + case 1232: return access_type_t::RW; + case 1236: return access_type_t::RW; + case 1240: return access_type_t::RW; + case 1244: return access_type_t::RW; + case 1248: return access_type_t::RW; + case 1252: return access_type_t::RW; + case 1256: return access_type_t::RW; + case 1260: return access_type_t::RW; + case 1264: return access_type_t::RW; + case 1268: return access_type_t::RW; + case 1272: return access_type_t::RW; + case 1276: return access_type_t::RW; + case 1280: return access_type_t::RW; + case 1284: return access_type_t::RW; + case 1288: return access_type_t::RW; + case 1292: return access_type_t::RW; + case 1296: return access_type_t::RW; + case 1300: return access_type_t::RW; + case 1304: return access_type_t::RW; + case 1308: return access_type_t::RW; + case 1312: return access_type_t::RW; + case 1316: return access_type_t::RW; + case 1320: return access_type_t::RW; + case 1324: return access_type_t::RW; + case 1328: return access_type_t::RW; + case 1332: return access_type_t::RW; + case 1336: return access_type_t::RW; + case 1340: return access_type_t::RW; + case 1344: return access_type_t::RW; + case 1348: return access_type_t::RW; + case 1352: return access_type_t::RW; + case 1356: return access_type_t::RW; + case 1360: return access_type_t::RW; + case 1364: return access_type_t::RW; + case 1368: return access_type_t::RW; + case 1372: return access_type_t::RW; + case 1376: return access_type_t::RW; + case 1380: return access_type_t::RW; + case 1384: return access_type_t::RW; + case 1388: return access_type_t::RW; + case 1392: return 
access_type_t::RW; + case 1396: return access_type_t::RW; + case 1400: return access_type_t::RW; + case 1404: return access_type_t::RW; + case 1408: return access_type_t::RW; + case 1412: return access_type_t::RW; + case 1416: return access_type_t::RW; + case 1420: return access_type_t::RW; + case 1424: return access_type_t::RW; + case 1428: return access_type_t::RW; + case 1432: return access_type_t::RW; + case 1436: return access_type_t::RW; + case 1440: return access_type_t::RW; + case 1444: return access_type_t::RW; + case 1448: return access_type_t::RW; + case 1452: return access_type_t::RW; + case 1456: return access_type_t::RW; + case 1460: return access_type_t::RW; + case 1464: return access_type_t::RW; + case 1468: return access_type_t::RW; + case 1472: return access_type_t::RW; + case 1476: return access_type_t::RW; + case 1480: return access_type_t::RW; + case 1484: return access_type_t::RW; + case 1488: return access_type_t::RW; + case 1492: return access_type_t::RW; + case 1496: return access_type_t::RW; + case 1500: return access_type_t::RW; + case 1504: return access_type_t::RW; + case 1508: return access_type_t::RW; + case 1512: return access_type_t::RW; + case 1516: return access_type_t::RW; + case 1520: return access_type_t::RW; + case 1524: return access_type_t::RW; + case 1528: return access_type_t::RW; + case 1532: return access_type_t::RW; + case 1536: return access_type_t::RW; + case 1540: return access_type_t::RW; + case 1544: return access_type_t::RW; + case 1548: return access_type_t::RW; + case 1552: return access_type_t::RW; + case 1556: return access_type_t::RW; + case 1560: return access_type_t::RW; + case 1564: return access_type_t::RW; + case 1568: return access_type_t::RW; + case 1572: return access_type_t::RW; + case 1576: return access_type_t::RW; + case 1580: return access_type_t::RW; + case 1584: return access_type_t::RW; + case 1588: return access_type_t::RW; + case 1592: return access_type_t::RW; + case 1596: return 
access_type_t::RW; + case 1600: return access_type_t::RW; + case 1604: return access_type_t::RW; + case 1608: return access_type_t::RW; + case 1612: return access_type_t::RW; + case 1616: return access_type_t::RW; + case 1620: return access_type_t::RW; + case 1624: return access_type_t::RW; + case 1628: return access_type_t::RW; + case 1632: return access_type_t::RW; + case 1636: return access_type_t::RW; + case 1640: return access_type_t::RW; + case 1644: return access_type_t::RW; + case 1648: return access_type_t::RW; + case 1652: return access_type_t::RW; + case 1656: return access_type_t::RW; + case 1660: return access_type_t::RW; + case 1664: return access_type_t::RW; + case 1668: return access_type_t::RW; + case 1672: return access_type_t::RW; + case 1676: return access_type_t::RW; + case 1680: return access_type_t::RW; + case 1684: return access_type_t::RW; + case 1688: return access_type_t::RW; + case 1692: return access_type_t::RW; + case 1696: return access_type_t::RW; + case 1700: return access_type_t::RW; + case 1704: return access_type_t::RW; + case 1708: return access_type_t::RW; + case 1712: return access_type_t::RW; + case 1716: return access_type_t::RW; + case 1720: return access_type_t::RW; + case 1724: return access_type_t::RW; + case 1728: return access_type_t::RW; + case 1732: return access_type_t::RW; + case 1736: return access_type_t::RW; + case 1740: return access_type_t::RW; + case 1744: return access_type_t::RW; + case 1748: return access_type_t::RW; + case 1752: return access_type_t::RW; + case 1756: return access_type_t::RW; + case 1760: return access_type_t::RW; + case 1764: return access_type_t::RW; + case 1768: return access_type_t::RW; + case 1772: return access_type_t::RW; + case 1776: return access_type_t::RW; + case 1780: return access_type_t::RW; + case 1784: return access_type_t::RW; + case 1788: return access_type_t::RW; + case 1792: return access_type_t::RW; + case 1796: return access_type_t::RW; + case 1800: return 
access_type_t::RW; + case 1804: return access_type_t::RW; + case 1808: return access_type_t::RW; + case 1812: return access_type_t::RW; + case 1816: return access_type_t::RW; + case 1820: return access_type_t::RW; + case 1824: return access_type_t::RW; + case 1828: return access_type_t::RW; + case 1832: return access_type_t::RW; + case 1836: return access_type_t::RW; + case 1840: return access_type_t::RW; + case 1844: return access_type_t::RW; + case 1848: return access_type_t::RW; + case 1852: return access_type_t::RW; + case 1856: return access_type_t::RW; + case 1860: return access_type_t::RW; + case 1864: return access_type_t::RW; + case 1868: return access_type_t::RW; + case 1872: return access_type_t::RW; + case 1876: return access_type_t::RW; + case 1880: return access_type_t::RW; + case 1884: return access_type_t::RW; + case 1888: return access_type_t::RW; + case 1892: return access_type_t::RW; + case 1896: return access_type_t::RW; + case 1900: return access_type_t::RW; + case 1904: return access_type_t::RW; + case 1908: return access_type_t::RW; + case 1912: return access_type_t::RW; + case 1916: return access_type_t::RW; + case 1920: return access_type_t::RW; + case 1924: return access_type_t::RW; + case 1928: return access_type_t::RW; + case 1932: return access_type_t::RW; + case 1936: return access_type_t::RW; + case 1940: return access_type_t::RW; + case 1944: return access_type_t::RW; + case 1948: return access_type_t::RW; + case 1952: return access_type_t::RW; + case 1956: return access_type_t::RW; + case 1960: return access_type_t::RW; + case 1964: return access_type_t::RW; + case 1968: return access_type_t::RW; + case 1972: return access_type_t::RW; + case 1976: return access_type_t::RW; + case 1980: return access_type_t::RW; + case 1984: return access_type_t::RW; + case 1988: return access_type_t::RW; + case 1992: return access_type_t::RW; + case 1996: return access_type_t::RW; + case 2000: return access_type_t::RW; + case 2004: return 
access_type_t::RW; + case 2008: return access_type_t::RW; + case 2012: return access_type_t::RW; + case 2016: return access_type_t::RW; + case 2020: return access_type_t::RW; + case 2024: return access_type_t::RW; + case 2028: return access_type_t::RW; + case 2032: return access_type_t::RW; + case 2036: return access_type_t::RW; + case 2040: return access_type_t::RW; + case 2044: return access_type_t::RW; + case 2048: return access_type_t::RW; + case 2052: return access_type_t::RW; + case 2056: return access_type_t::RW; + case 2060: return access_type_t::RW; + case 2064: return access_type_t::RW; + case 2068: return access_type_t::RW; + case 2076: return access_type_t::RW; + case 2080: return access_type_t::RW; + case 2084: return access_type_t::RW; + case 2088: return access_type_t::RW; + case 2092: return access_type_t::RW; + case 2096: return access_type_t::RW; + case 2108: return access_type_t::RW; + case 2116: return access_type_t::RW; + case 2120: return access_type_t::RW; + case 2124: return access_type_t::RW; + case 2128: return access_type_t::RW; + case 2132: return access_type_t::RW; + case 2136: return access_type_t::RW; + case 2140: return access_type_t::RW; + case 2144: return access_type_t::RW; + case 2152: return access_type_t::RW; + case 2156: return access_type_t::RW; + case 2160: return access_type_t::RW; + case 2172: return access_type_t::RW; + case 2176: return access_type_t::RW; + case 2180: return access_type_t::RW; + case 2184: return access_type_t::RW; + case 2192: return access_type_t::RW; + case 2196: return access_type_t::RW; + case 2200: return access_type_t::RW; + case 2204: return access_type_t::RW; + case 2208: return access_type_t::RW; + case 2212: return access_type_t::RW; + case 2216: return access_type_t::RW; + case 2220: return access_type_t::RW; + case 2224: return access_type_t::RW; + case 2228: return access_type_t::RW; + case 2232: return access_type_t::RW; + case 2236: return access_type_t::RW; + case 2240: return 
access_type_t::RW; + case 2244: return access_type_t::RW; + case 2248: return access_type_t::RW; + case 2252: return access_type_t::RW; + case 2256: return access_type_t::RW; + case 2304: return access_type_t::RW; + case 2324: return access_type_t::RW; + case 2340: return access_type_t::RW; + case 2344: return access_type_t::RW; + case 2348: return access_type_t::RW; + case 2352: return access_type_t::RW; + case 2364: return access_type_t::RW; + case 2560: return access_type_t::RW; + case 2568: return access_type_t::RW; + case 2576: return access_type_t::RW; + case 2584: return access_type_t::RW; + case 2592: return access_type_t::RW; + case 2600: return access_type_t::RW; + case 2608: return access_type_t::RW; + case 2624: return access_type_t::RW; + case 2632: return access_type_t::RW; + case 2640: return access_type_t::RW; + case 2648: return access_type_t::RW; + case 2656: return access_type_t::RW; + case 2664: return access_type_t::RW; + case 2672: return access_type_t::RW; + case 2688: return access_type_t::RW; + case 2696: return access_type_t::RW; + case 2704: return access_type_t::RW; + case 2712: return access_type_t::RW; + case 2720: return access_type_t::RW; + case 2728: return access_type_t::RW; + case 2736: return access_type_t::RW; + case 2744: return access_type_t::RW; + case 2752: return access_type_t::RW; + case 2760: return access_type_t::RW; + case 2768: return access_type_t::RW; + case 2776: return access_type_t::RW; + case 2784: return access_type_t::RW; + case 2792: return access_type_t::RW; + case 2800: return access_type_t::RW; + case 2808: return access_type_t::RW; + case 2816: return access_type_t::RW; + case 2824: return access_type_t::RW; + case 2832: return access_type_t::RW; + case 2840: return access_type_t::RW; + case 2848: return access_type_t::RW; + case 2856: return access_type_t::RW; + case 2864: return access_type_t::RW; + case 2880: return access_type_t::RW; + case 2888: return access_type_t::RW; + case 2896: return 
access_type_t::RW; + case 2904: return access_type_t::RW; + case 2912: return access_type_t::RW; + case 2920: return access_type_t::RW; + case 2928: return access_type_t::RW; + case 2936: return access_type_t::RW; + case 3008: return access_type_t::RW; + case 3016: return access_type_t::RW; + case 4032: return access_type_t::RO; + case 4048: return access_type_t::RO; + case 4052: return access_type_t::RO; + case 4056: return access_type_t::RO; + case 4060: return access_type_t::RO; + case 4064: return access_type_t::RO; + case 4068: return access_type_t::RO; + case 4072: return access_type_t::RO; + case 4076: return access_type_t::RO; + case 4080: return access_type_t::RO; + case 4084: return access_type_t::RO; + case 4088: return access_type_t::RO; + case 4092: return access_type_t::RO; + case 4352: return access_type_t::RO; + case 4356: return access_type_t::RO; + case 4360: return access_type_t::RO; + case 4368: return access_type_t::RO; + case 4372: return access_type_t::RO; + case 4480: return access_type_t::RW; + case 4484: return access_type_t::RW; + case 4488: return access_type_t::RW; + case 4492: return access_type_t::RW; + case 4496: return access_type_t::RW; + case 4500: return access_type_t::RW; + case 4504: return access_type_t::RW; + case 4512: return access_type_t::RW; + case 4520: return access_type_t::RW; + case 4524: return access_type_t::RW; + case 4528: return access_type_t::RW; + case 4864: return access_type_t::RW; + case 4868: return access_type_t::RW; + case 4872: return access_type_t::RW; + case 4876: return access_type_t::RW; + case 4880: return access_type_t::RW; + case 4884: return access_type_t::RW; + case 4888: return access_type_t::RW; + case 4892: return access_type_t::RW; + case 4992: return access_type_t::RW; + case 4996: return access_type_t::RW; + case 5000: return access_type_t::RW; + case 5004: return access_type_t::RW; + case 5008: return access_type_t::RW; + case 5012: return access_type_t::RW; + case 5016: return 
access_type_t::RW; + case 5020: return access_type_t::RW; + default: return access_type_t::RO; + } + } +#endif +}; + +#ifdef __cplusplus +struct isa +{ +#ifdef NPU_DISASSEMBLE +static int disassemble(const uint32_t* in, std::string& op, std::vector>& fields) +{ + switch (*in & 0xffff) + { + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_STOP): + { + const npu_op_stop_t& v = *reinterpret_cast(in); + op = "NPU_OP_STOP"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_IRQ): + { + const npu_op_irq_t& v = *reinterpret_cast(in); + op = "NPU_OP_IRQ"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_CONV): + { + const npu_op_conv_t& v = *reinterpret_cast(in); + op = "NPU_OP_CONV"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_DEPTHWISE): + { + const npu_op_depthwise_t& v = *reinterpret_cast(in); + op = "NPU_OP_DEPTHWISE"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_POOL): + { + const npu_op_pool_t& v = *reinterpret_cast(in); + op = "NPU_OP_POOL"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_ELEMENTWISE): + { + const npu_op_elementwise_t& v = *reinterpret_cast(in); + op = "NPU_OP_ELEMENTWISE"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_RESIZE): + { + const npu_op_resize_t& v = *reinterpret_cast(in); + op = "NPU_OP_RESIZE"; + v.disassemble(fields); + break; + } + case 
(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_DMA_START): + { + const npu_op_dma_start_t& v = *reinterpret_cast(in); + op = "NPU_OP_DMA_START"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_DMA_WAIT): + { + const npu_op_dma_wait_t& v = *reinterpret_cast(in); + op = "NPU_OP_DMA_WAIT"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_KERNEL_WAIT): + { + const npu_op_kernel_wait_t& v = *reinterpret_cast(in); + op = "NPU_OP_KERNEL_WAIT"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_PMU_MASK): + { + const npu_op_pmu_mask_t& v = *reinterpret_cast(in); + op = "NPU_OP_PMU_MASK"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_TOP): + { + const npu_set_ifm_pad_top_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_PAD_TOP"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_LEFT): + { + const npu_set_ifm_pad_left_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_PAD_LEFT"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_RIGHT): + { + const npu_set_ifm_pad_right_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_PAD_RIGHT"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_BOTTOM): + { + const npu_set_ifm_pad_bottom_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_PAD_BOTTOM"; + 
v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_DEPTH_M1): + { + const npu_set_ifm_depth_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_DEPTH_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PRECISION): + { + const npu_set_ifm_precision_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_PRECISION"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_UPSCALE): + { + const npu_set_ifm_upscale_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_UPSCALE"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_ZERO_POINT): + { + const npu_set_ifm_zero_point_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_ZERO_POINT"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_WIDTH0_M1): + { + const npu_set_ifm_width0_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_WIDTH0_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_HEIGHT0_M1): + { + const npu_set_ifm_height0_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_HEIGHT0_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_HEIGHT1_M1): + { + const npu_set_ifm_height1_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_HEIGHT1_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | 
static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_REGION): + { + const npu_set_ifm_region_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_REGION"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_BROADCAST): + { + const npu_set_ifm_broadcast_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_BROADCAST"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_WIDTH_M1): + { + const npu_set_ofm_width_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_WIDTH_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT_M1): + { + const npu_set_ofm_height_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_HEIGHT_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_DEPTH_M1): + { + const npu_set_ofm_depth_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_DEPTH_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_PRECISION): + { + const npu_set_ofm_precision_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_PRECISION"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_WIDTH_M1): + { + const npu_set_ofm_blk_width_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_BLK_WIDTH_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_HEIGHT_M1): + { + const npu_set_ofm_blk_height_m1_t& v = *reinterpret_cast(in); + op = 
"NPU_SET_OFM_BLK_HEIGHT_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_DEPTH_M1): + { + const npu_set_ofm_blk_depth_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_BLK_DEPTH_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_ZERO_POINT): + { + const npu_set_ofm_zero_point_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_ZERO_POINT"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_WIDTH0_M1): + { + const npu_set_ofm_width0_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_WIDTH0_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT0_M1): + { + const npu_set_ofm_height0_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_HEIGHT0_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT1_M1): + { + const npu_set_ofm_height1_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_HEIGHT1_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_REGION): + { + const npu_set_ofm_region_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_REGION"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_WIDTH_M1): + { + const npu_set_kernel_width_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_KERNEL_WIDTH_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | 
static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_HEIGHT_M1): + { + const npu_set_kernel_height_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_KERNEL_HEIGHT_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_STRIDE): + { + const npu_set_kernel_stride_t& v = *reinterpret_cast(in); + op = "NPU_SET_KERNEL_STRIDE"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACC_FORMAT): + { + const npu_set_acc_format_t& v = *reinterpret_cast(in); + op = "NPU_SET_ACC_FORMAT"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION): + { + const npu_set_activation_t& v = *reinterpret_cast(in); + op = "NPU_SET_ACTIVATION"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION_MIN): + { + const npu_set_activation_min_t& v = *reinterpret_cast(in); + op = "NPU_SET_ACTIVATION_MIN"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION_MAX): + { + const npu_set_activation_max_t& v = *reinterpret_cast(in); + op = "NPU_SET_ACTIVATION_MAX"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_WEIGHT_REGION): + { + const npu_set_weight_region_t& v = *reinterpret_cast(in); + op = "NPU_SET_WEIGHT_REGION"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_SCALE_REGION): + { + const npu_set_scale_region_t& v = *reinterpret_cast(in); + op = "NPU_SET_SCALE_REGION"; + 
v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_WEIGHT_FORMAT): + { + const npu_set_weight_format_t& v = *reinterpret_cast(in); + op = "NPU_SET_WEIGHT_FORMAT"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_BLOCKDEP): + { + const npu_set_blockdep_t& v = *reinterpret_cast(in); + op = "NPU_SET_BLOCKDEP"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_RESIZE_X_SCALE_N_M1): + { + const npu_set_resize_x_scale_n_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_RESIZE_X_SCALE_N_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_RESIZE_Y_SCALE_N_M1): + { + const npu_set_resize_y_scale_n_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_RESIZE_Y_SCALE_N_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_RESIZE_X_OFFSET): + { + const npu_set_resize_x_offset_t& v = *reinterpret_cast(in); + op = "NPU_SET_RESIZE_X_OFFSET"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_RESIZE_Y_OFFSET): + { + const npu_set_resize_y_offset_t& v = *reinterpret_cast(in); + op = "NPU_SET_RESIZE_Y_OFFSET"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SRC_REGION): + { + const npu_set_dma0_src_region_t& v = *reinterpret_cast(in); + op = "NPU_SET_DMA0_SRC_REGION"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | 
static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_DST_REGION): + { + const npu_set_dma0_dst_region_t& v = *reinterpret_cast(in); + op = "NPU_SET_DMA0_DST_REGION"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SIZE0): + { + const npu_set_dma0_size0_t& v = *reinterpret_cast(in); + op = "NPU_SET_DMA0_SIZE0"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SIZE1): + { + const npu_set_dma0_size1_t& v = *reinterpret_cast(in); + op = "NPU_SET_DMA0_SIZE1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_IDX_REGION): + { + const npu_set_dma0_idx_region_t& v = *reinterpret_cast(in); + op = "NPU_SET_DMA0_IDX_REGION"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_BROADCAST): + { + const npu_set_ifm2_broadcast_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_BROADCAST"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_PRECISION): + { + const npu_set_ifm2_precision_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_PRECISION"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_ZERO_POINT): + { + const npu_set_ifm2_zero_point_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_ZERO_POINT"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_WIDTH0_M1): + { + const npu_set_ifm2_width0_m1_t& v = *reinterpret_cast(in); + op = 
"NPU_SET_IFM2_WIDTH0_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_HEIGHT0_M1): + { + const npu_set_ifm2_height0_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_HEIGHT0_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_HEIGHT1_M1): + { + const npu_set_ifm2_height1_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_HEIGHT1_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_REGION): + { + const npu_set_ifm2_region_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_REGION"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE0): + { + const npu_set_ifm_base0_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_BASE0"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE1): + { + const npu_set_ifm_base1_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_BASE1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE2): + { + const npu_set_ifm_base2_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_BASE2"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE3): + { + const npu_set_ifm_base3_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_BASE3"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_X): + { + 
const npu_set_ifm_stride_x_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_STRIDE_X"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_Y): + { + const npu_set_ifm_stride_y_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_STRIDE_Y"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_C): + { + const npu_set_ifm_stride_c_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_STRIDE_C"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE0): + { + const npu_set_ofm_base0_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_BASE0"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE1): + { + const npu_set_ofm_base1_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_BASE1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE2): + { + const npu_set_ofm_base2_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_BASE2"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE3): + { + const npu_set_ofm_base3_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_BASE3"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_X): + { + const npu_set_ofm_stride_x_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_STRIDE_X"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | 
static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_Y): + { + const npu_set_ofm_stride_y_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_STRIDE_Y"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_C): + { + const npu_set_ofm_stride_c_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_STRIDE_C"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT_BASE): + { + const npu_set_weight_base_t& v = *reinterpret_cast(in); + op = "NPU_SET_WEIGHT_BASE"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT_LENGTH): + { + const npu_set_weight_length_t& v = *reinterpret_cast(in); + op = "NPU_SET_WEIGHT_LENGTH"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE_BASE): + { + const npu_set_scale_base_t& v = *reinterpret_cast(in); + op = "NPU_SET_SCALE_BASE"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE_LENGTH): + { + const npu_set_scale_length_t& v = *reinterpret_cast(in); + op = "NPU_SET_SCALE_LENGTH"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_SCALE): + { + const npu_set_ofm_scale_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_SCALE"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_SCALE): + { + const npu_set_ifm_scale_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_SCALE"; + v.disassemble(fields); + break; + } + case 
(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_SCALE): + { + const npu_set_ifm2_scale_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_SCALE"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OP_SCALAR): + { + const npu_set_op_scalar_t& v = *reinterpret_cast(in); + op = "NPU_SET_OP_SCALAR"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SRC): + { + const npu_set_dma0_src_t& v = *reinterpret_cast(in); + op = "NPU_SET_DMA0_SRC"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_DST): + { + const npu_set_dma0_dst_t& v = *reinterpret_cast(in); + op = "NPU_SET_DMA0_DST"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_LEN): + { + const npu_set_dma0_len_t& v = *reinterpret_cast(in); + op = "NPU_SET_DMA0_LEN"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SRC_STRIDE0): + { + const npu_set_dma0_src_stride0_t& v = *reinterpret_cast(in); + op = "NPU_SET_DMA0_SRC_STRIDE0"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SRC_STRIDE1): + { + const npu_set_dma0_src_stride1_t& v = *reinterpret_cast(in); + op = "NPU_SET_DMA0_SRC_STRIDE1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_DST_STRIDE0): + { + const npu_set_dma0_dst_stride0_t& v = *reinterpret_cast(in); + op = 
"NPU_SET_DMA0_DST_STRIDE0"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_DST_STRIDE1): + { + const npu_set_dma0_dst_stride1_t& v = *reinterpret_cast(in); + op = "NPU_SET_DMA0_DST_STRIDE1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_IDX): + { + const npu_set_dma0_idx_t& v = *reinterpret_cast(in); + op = "NPU_SET_DMA0_IDX"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_IDX_MAX): + { + const npu_set_dma0_idx_max_t& v = *reinterpret_cast(in); + op = "NPU_SET_DMA0_IDX_MAX"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_IDX_SKIP1): + { + const npu_set_dma0_idx_skip1_t& v = *reinterpret_cast(in); + op = "NPU_SET_DMA0_IDX_SKIP1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE0): + { + const npu_set_ifm2_base0_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_BASE0"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE1): + { + const npu_set_ifm2_base1_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_BASE1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE2): + { + const npu_set_ifm2_base2_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_BASE2"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE3): 
+ { + const npu_set_ifm2_base3_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_BASE3"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_X): + { + const npu_set_ifm2_stride_x_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_STRIDE_X"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_Y): + { + const npu_set_ifm2_stride_y_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_STRIDE_Y"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_C): + { + const npu_set_ifm2_stride_c_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_STRIDE_C"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT1_BASE): + { + const npu_set_weight1_base_t& v = *reinterpret_cast(in); + op = "NPU_SET_WEIGHT1_BASE"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT1_LENGTH): + { + const npu_set_weight1_length_t& v = *reinterpret_cast(in); + op = "NPU_SET_WEIGHT1_LENGTH"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT2_BASE): + { + const npu_set_weight2_base_t& v = *reinterpret_cast(in); + op = "NPU_SET_WEIGHT2_BASE"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT2_LENGTH): + { + const npu_set_weight2_length_t& v = *reinterpret_cast(in); + op = "NPU_SET_WEIGHT2_LENGTH"; + v.disassemble(fields); + break; + } + case 
(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT3_BASE): + { + const npu_set_weight3_base_t& v = *reinterpret_cast(in); + op = "NPU_SET_WEIGHT3_BASE"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT3_LENGTH): + { + const npu_set_weight3_length_t& v = *reinterpret_cast(in); + op = "NPU_SET_WEIGHT3_LENGTH"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_RESIZE_X): + { + const npu_set_resize_x_step_t& v = *reinterpret_cast(in); + op = "NPU_SET_RESIZE_X_STEP"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_RESIZE_Y): + { + const npu_set_resize_y_step_t& v = *reinterpret_cast(in); + op = "NPU_SET_RESIZE_Y_STEP"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_OP_BRANCH): + { + const npu_op_branch_t& v = *reinterpret_cast(in); + op = "NPU_OP_BRANCH"; + v.disassemble(fields); + break; + } + default: break; + } + return (*in & (3<<14)) != 0 ? 
2 : 1; +} +#endif +#endif + +struct npu_op_stop_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t mask:16; +#ifdef __cplusplus +public: + npu_op_stop_t(uint32_t _mask) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_STOP)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + mask(_mask & ((1U << 16)-1)) + {} + CONSTEXPR npu_op_stop_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_STOP)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + mask(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_STOP) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_STOP); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(mask) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_op_stop_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_op_stop_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_mask() const + { + return static_cast(mask); + } + CONSTEXPR npu_op_stop_t& set_mask(uint32_t value) + { + assert((value >> 16) == 0); + mask = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("mask", std::to_string(mask))); + } +#endif +#endif +}; + +struct npu_op_irq_t +{ 
+#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t mask:16; +#ifdef __cplusplus +public: + npu_op_irq_t(uint32_t _mask) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_IRQ)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + mask(_mask & ((1U << 16)-1)) + {} + CONSTEXPR npu_op_irq_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_IRQ)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + mask(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_IRQ) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_IRQ); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(mask) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_op_irq_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_op_irq_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_mask() const + { + return static_cast(mask); + } + CONSTEXPR npu_op_irq_t& set_mask(uint32_t value) + { + assert((value >> 16) == 0); + mask = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("mask", std::to_string(mask))); + } +#endif +#endif +}; + +struct npu_op_conv_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + 
uint32_t reserved0:4; + uint32_t control:2; + uint32_t weights_ifm2:1; + uint32_t reserved1:15; +#ifdef __cplusplus +public: + npu_op_conv_t(uint32_t _weights_ifm2) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_CONV)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + weights_ifm2(_weights_ifm2 & ((1U << 1)-1)), + reserved1(0) + {} + CONSTEXPR npu_op_conv_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_CONV)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + weights_ifm2(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_CONV) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_CONV); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(weights_ifm2) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_op_conv_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_op_conv_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_weights_ifm2() const + { + return static_cast(weights_ifm2); + } + CONSTEXPR npu_op_conv_t& set_weights_ifm2(uint32_t value) + { + assert((value >> 1) == 0); + weights_ifm2 = static_cast(value & ((1U << 1)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("weights_ifm2", std::to_string(weights_ifm2))); + } +#endif +#endif 
+}; + +struct npu_op_depthwise_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t reserved1:16; +#ifdef __cplusplus +public: + CONSTEXPR npu_op_depthwise_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_DEPTHWISE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_DEPTHWISE) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_DEPTHWISE); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_op_depthwise_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_op_depthwise_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>&) const + { + } +#endif +#endif +}; + +struct npu_op_pool_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t pooling_mode:3; + uint32_t reserved1:13; +#ifdef __cplusplus +public: + npu_op_pool_t(NPU_NAMESPACE::pooling_mode _pooling_mode) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_POOL)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + pooling_mode(static_cast(_pooling_mode) & ((1U << 3)-1)), + reserved1(0) + {} + CONSTEXPR npu_op_pool_t() : + 
opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_POOL)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + pooling_mode(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_POOL) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_POOL); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(pooling_mode) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_op_pool_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_op_pool_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::pooling_mode get_pooling_mode() const + { + return static_cast(pooling_mode); + } + CONSTEXPR npu_op_pool_t& set_pooling_mode(NPU_NAMESPACE::pooling_mode value) + { + pooling_mode = static_cast(value) & ((1U << 3)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("pooling_mode", (pooling_mode < (sizeof(pooling_mode_str)/sizeof(pooling_mode_str[0])) ? 
pooling_mode_str[pooling_mode] : "****"))); + } +#endif +#endif +}; + +struct npu_op_elementwise_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t elementwise_mode:6; + uint32_t reserved1:10; +#ifdef __cplusplus +public: + npu_op_elementwise_t(NPU_NAMESPACE::elementwise_mode _elementwise_mode) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_ELEMENTWISE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + elementwise_mode(static_cast(_elementwise_mode) & ((1U << 6)-1)), + reserved1(0) + {} + CONSTEXPR npu_op_elementwise_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_ELEMENTWISE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + elementwise_mode(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_ELEMENTWISE) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_ELEMENTWISE); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(elementwise_mode) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_op_elementwise_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_op_elementwise_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::elementwise_mode get_elementwise_mode() const + { + return static_cast(elementwise_mode); + } + CONSTEXPR npu_op_elementwise_t& 
set_elementwise_mode(NPU_NAMESPACE::elementwise_mode value) + { + elementwise_mode = static_cast(value) & ((1U << 6)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("elementwise_mode", (elementwise_mode < (sizeof(elementwise_mode_str)/sizeof(elementwise_mode_str[0])) ? elementwise_mode_str[elementwise_mode] : "****"))); + } +#endif +#endif +}; + +struct npu_op_resize_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t resize_mode:2; + uint32_t reserved1:14; +#ifdef __cplusplus +public: + npu_op_resize_t(NPU_NAMESPACE::resize_mode _resize_mode) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_RESIZE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + resize_mode(static_cast(_resize_mode) & ((1U << 2)-1)), + reserved1(0) + {} + CONSTEXPR npu_op_resize_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_RESIZE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + resize_mode(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_RESIZE) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_RESIZE); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(resize_mode) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_op_resize_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR 
npu_op_resize_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::resize_mode get_resize_mode() const + { + return static_cast(resize_mode); + } + CONSTEXPR npu_op_resize_t& set_resize_mode(NPU_NAMESPACE::resize_mode value) + { + resize_mode = static_cast(value) & ((1U << 2)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("resize_mode", (resize_mode < (sizeof(resize_mode_str)/sizeof(resize_mode_str[0])) ? resize_mode_str[resize_mode] : "****"))); + } +#endif +#endif +}; + +struct npu_op_dma_start_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t reserved1:16; +#ifdef __cplusplus +public: + CONSTEXPR npu_op_dma_start_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_DMA_START)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_DMA_START) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_DMA_START); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_op_dma_start_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_op_dma_start_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return 
*this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>&) const + { + } +#endif +#endif +}; + +struct npu_op_dma_wait_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t k:2; + uint32_t reserved1:14; +#ifdef __cplusplus +public: + npu_op_dma_wait_t(uint32_t _k) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_DMA_WAIT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + k(_k & ((1U << 2)-1)), + reserved1(0) + {} + CONSTEXPR npu_op_dma_wait_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_DMA_WAIT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + k(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_DMA_WAIT) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_DMA_WAIT); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(k) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_op_dma_wait_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_op_dma_wait_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_k() const + { + return static_cast(k); + } + CONSTEXPR npu_op_dma_wait_t& set_k(uint32_t value) + { + assert((value >> 2) == 0); + k = static_cast(value & ((1U << 2)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& 
fields) const + { + fields.push_back(std::make_pair("k", std::to_string(k))); + } +#endif +#endif +}; + +struct npu_op_kernel_wait_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t n:1; + uint32_t reserved1:15; +#ifdef __cplusplus +public: + npu_op_kernel_wait_t(uint32_t _n) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_KERNEL_WAIT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + n(_n & ((1U << 1)-1)), + reserved1(0) + {} + CONSTEXPR npu_op_kernel_wait_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_KERNEL_WAIT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + n(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_KERNEL_WAIT) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_KERNEL_WAIT); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(n) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_op_kernel_wait_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_op_kernel_wait_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_n() const + { + return static_cast(n); + } + CONSTEXPR npu_op_kernel_wait_t& set_n(uint32_t value) + { + assert((value >> 1) == 0); + n = static_cast(value & ((1U << 1)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + 
void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("n", std::to_string(n))); + } +#endif +#endif +}; + +struct npu_op_pmu_mask_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t enable:1; + uint32_t reserved1:15; +#ifdef __cplusplus +public: + npu_op_pmu_mask_t(uint32_t _enable) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_PMU_MASK)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + enable(_enable & ((1U << 1)-1)), + reserved1(0) + {} + CONSTEXPR npu_op_pmu_mask_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_PMU_MASK)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + enable(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_PMU_MASK) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_PMU_MASK); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(enable) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_op_pmu_mask_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_op_pmu_mask_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_enable() const + { + return static_cast(enable); + } + CONSTEXPR npu_op_pmu_mask_t& set_enable(uint32_t value) + { + assert((value >> 1) == 0); + enable = static_cast(value & ((1U << 
1)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("enable", std::to_string(enable))); + } +#endif +#endif +}; + +struct npu_set_ifm_pad_top_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t pad:7; + uint32_t reserved1:9; +#ifdef __cplusplus +public: + npu_set_ifm_pad_top_t(uint32_t _pad) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_TOP)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + pad(_pad & ((1U << 7)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_ifm_pad_top_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_TOP)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + pad(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_TOP) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_TOP); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(pad) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm_pad_top_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm_pad_top_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_pad() const + { + return static_cast(pad); + } + CONSTEXPR npu_set_ifm_pad_top_t& set_pad(uint32_t value) + 
{ + assert((value >> 7) == 0); + pad = static_cast(value & ((1U << 7)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("pad", std::to_string(pad))); + } +#endif +#endif +}; + +struct npu_set_ifm_pad_left_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t pad:7; + uint32_t reserved1:9; +#ifdef __cplusplus +public: + npu_set_ifm_pad_left_t(uint32_t _pad) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_LEFT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + pad(_pad & ((1U << 7)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_ifm_pad_left_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_LEFT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + pad(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_LEFT) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_LEFT); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(pad) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm_pad_left_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm_pad_left_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_pad() const + { + return 
static_cast(pad); + } + CONSTEXPR npu_set_ifm_pad_left_t& set_pad(uint32_t value) + { + assert((value >> 7) == 0); + pad = static_cast(value & ((1U << 7)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("pad", std::to_string(pad))); + } +#endif +#endif +}; + +struct npu_set_ifm_pad_right_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t pad:8; + uint32_t reserved1:8; +#ifdef __cplusplus +public: + npu_set_ifm_pad_right_t(uint32_t _pad) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_RIGHT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + pad(_pad & ((1U << 8)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_ifm_pad_right_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_RIGHT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + pad(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_RIGHT) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_RIGHT); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(pad) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm_pad_right_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm_pad_right_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U 
<< 2)-1); + return *this; + } + CONSTEXPR uint32_t get_pad() const + { + return static_cast(pad); + } + CONSTEXPR npu_set_ifm_pad_right_t& set_pad(uint32_t value) + { + assert((value >> 8) == 0); + pad = static_cast(value & ((1U << 8)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("pad", std::to_string(pad))); + } +#endif +#endif +}; + +struct npu_set_ifm_pad_bottom_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t pad:8; + uint32_t reserved1:8; +#ifdef __cplusplus +public: + npu_set_ifm_pad_bottom_t(uint32_t _pad) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_BOTTOM)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + pad(_pad & ((1U << 8)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_ifm_pad_bottom_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_BOTTOM)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + pad(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_BOTTOM) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_BOTTOM); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(pad) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm_pad_bottom_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR 
npu_set_ifm_pad_bottom_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_pad() const + { + return static_cast(pad); + } + CONSTEXPR npu_set_ifm_pad_bottom_t& set_pad(uint32_t value) + { + assert((value >> 8) == 0); + pad = static_cast(value & ((1U << 8)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("pad", std::to_string(pad))); + } +#endif +#endif +}; + +struct npu_set_ifm_depth_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t depth_m1:16; +#ifdef __cplusplus +public: + npu_set_ifm_depth_m1_t(uint32_t _depth_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_DEPTH_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + depth_m1(_depth_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ifm_depth_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_DEPTH_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + depth_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_DEPTH_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_DEPTH_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(depth_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm_depth_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const 
+ { + return static_cast(control); + } + CONSTEXPR npu_set_ifm_depth_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_depth_m1() const + { + return static_cast(depth_m1); + } + CONSTEXPR npu_set_ifm_depth_m1_t& set_depth_m1(uint32_t value) + { + assert((value >> 16) == 0); + depth_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("depth_m1", std::to_string(depth_m1))); + } +#endif +#endif +}; + +struct npu_set_ifm_precision_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t activation_type:1; + uint32_t reserved1:1; + uint32_t activation_precision:2; + uint32_t reserved2:2; + uint32_t activation_format:2; + uint32_t reserved3:6; + uint32_t activation_storage:2; +#ifdef __cplusplus +public: + npu_set_ifm_precision_t(NPU_NAMESPACE::activation_type _activation_type, NPU_NAMESPACE::activation_precision _activation_precision, NPU_NAMESPACE::activation_format _activation_format, NPU_NAMESPACE::activation_storage _activation_storage) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PRECISION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + activation_type(static_cast(_activation_type) & ((1U << 1)-1)), + reserved1(0), + activation_precision(static_cast(_activation_precision) & ((1U << 2)-1)), + reserved2(0), + activation_format(static_cast(_activation_format) & ((1U << 2)-1)), + reserved3(0), + activation_storage(static_cast(_activation_storage) & ((1U << 2)-1)) + {} + CONSTEXPR npu_set_ifm_precision_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PRECISION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + activation_type(0), + reserved1(0), + activation_precision(0), + reserved2(0), + 
activation_format(0), + reserved3(0), + activation_storage(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PRECISION) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PRECISION); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(activation_type) << 16; + word |= uint32_t(activation_precision) << 18; + word |= uint32_t(activation_format) << 22; + word |= uint32_t(activation_storage) << 30; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm_precision_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm_precision_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::activation_type get_activation_type() const + { + return static_cast(activation_type); + } + CONSTEXPR npu_set_ifm_precision_t& set_activation_type(NPU_NAMESPACE::activation_type value) + { + activation_type = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::activation_precision get_activation_precision() const + { + return static_cast(activation_precision); + } + CONSTEXPR npu_set_ifm_precision_t& set_activation_precision(NPU_NAMESPACE::activation_precision value) + { + activation_precision = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::activation_format get_activation_format() const + { + return static_cast(activation_format); + } + CONSTEXPR 
npu_set_ifm_precision_t& set_activation_format(NPU_NAMESPACE::activation_format value) + { + activation_format = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::activation_storage get_activation_storage() const + { + return static_cast(activation_storage); + } + CONSTEXPR npu_set_ifm_precision_t& set_activation_storage(NPU_NAMESPACE::activation_storage value) + { + activation_storage = static_cast(value) & ((1U << 2)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("activation_type", (activation_type < (sizeof(activation_type_str)/sizeof(activation_type_str[0])) ? activation_type_str[activation_type] : "****"))); + fields.push_back(std::make_pair("activation_precision", (activation_precision < (sizeof(activation_precision_str)/sizeof(activation_precision_str[0])) ? activation_precision_str[activation_precision] : "****"))); + fields.push_back(std::make_pair("activation_format", (activation_format < (sizeof(activation_format_str)/sizeof(activation_format_str[0])) ? activation_format_str[activation_format] : "****"))); + fields.push_back(std::make_pair("activation_storage", (activation_storage < (sizeof(activation_storage_str)/sizeof(activation_storage_str[0])) ? 
activation_storage_str[activation_storage] : "****"))); + } +#endif +#endif +}; + +struct npu_set_ifm_upscale_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t mode:2; + uint32_t reserved1:14; +#ifdef __cplusplus +public: + npu_set_ifm_upscale_t(NPU_NAMESPACE::ifm_upscale_mode _mode) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_UPSCALE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + mode(static_cast(_mode) & ((1U << 2)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_ifm_upscale_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_UPSCALE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + mode(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_UPSCALE) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_UPSCALE); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(mode) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm_upscale_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm_upscale_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::ifm_upscale_mode get_mode() const + { + return static_cast(mode); + } + CONSTEXPR npu_set_ifm_upscale_t& set_mode(NPU_NAMESPACE::ifm_upscale_mode value) + { + mode = 
static_cast(value) & ((1U << 2)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("mode", (mode < (sizeof(ifm_upscale_mode_str)/sizeof(ifm_upscale_mode_str[0])) ? ifm_upscale_mode_str[mode] : "****"))); + } +#endif +#endif +}; + +struct npu_set_ifm_zero_point_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t zero_point:16; +#ifdef __cplusplus +public: + npu_set_ifm_zero_point_t(uint32_t _zero_point) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_ZERO_POINT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + zero_point(_zero_point & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ifm_zero_point_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_ZERO_POINT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + zero_point(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_ZERO_POINT) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_ZERO_POINT); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(zero_point) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm_zero_point_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm_zero_point_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + 
CONSTEXPR uint32_t get_zero_point() const + { + return static_cast(zero_point); + } + CONSTEXPR npu_set_ifm_zero_point_t& set_zero_point(uint32_t value) + { + assert((value >> 16) == 0); + zero_point = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("zero_point", std::to_string(zero_point))); + } +#endif +#endif +}; + +struct npu_set_ifm_width0_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t width_m1:16; +#ifdef __cplusplus +public: + npu_set_ifm_width0_m1_t(uint32_t _width_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_WIDTH0_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + width_m1(_width_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ifm_width0_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_WIDTH0_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + width_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_WIDTH0_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_WIDTH0_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(width_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm_width0_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm_width0_m1_t& 
set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_width_m1() const + { + return static_cast(width_m1); + } + CONSTEXPR npu_set_ifm_width0_m1_t& set_width_m1(uint32_t value) + { + assert((value >> 16) == 0); + width_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("width_m1", std::to_string(width_m1))); + } +#endif +#endif +}; + +struct npu_set_ifm_height0_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t height_m1:16; +#ifdef __cplusplus +public: + npu_set_ifm_height0_m1_t(uint32_t _height_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_HEIGHT0_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(_height_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ifm_height0_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_HEIGHT0_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_HEIGHT0_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_HEIGHT0_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(height_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm_height0_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR 
NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm_height0_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_height_m1() const + { + return static_cast(height_m1); + } + CONSTEXPR npu_set_ifm_height0_m1_t& set_height_m1(uint32_t value) + { + assert((value >> 16) == 0); + height_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("height_m1", std::to_string(height_m1))); + } +#endif +#endif +}; + +struct npu_set_ifm_height1_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t height_m1:16; +#ifdef __cplusplus +public: + npu_set_ifm_height1_m1_t(uint32_t _height_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_HEIGHT1_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(_height_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ifm_height1_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_HEIGHT1_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_HEIGHT1_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_HEIGHT1_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(height_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm_height1_m1_t& 
set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm_height1_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_height_m1() const + { + return static_cast(height_m1); + } + CONSTEXPR npu_set_ifm_height1_m1_t& set_height_m1(uint32_t value) + { + assert((value >> 16) == 0); + height_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("height_m1", std::to_string(height_m1))); + } +#endif +#endif +}; + +struct npu_set_ifm_region_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t region:3; + uint32_t reserved1:13; +#ifdef __cplusplus +public: + npu_set_ifm_region_t(uint32_t _region) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(_region & ((1U << 3)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_ifm_region_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_REGION) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_REGION); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(region) << 16; + return word; + } + CONSTEXPR 
NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm_region_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm_region_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_region() const + { + return static_cast(region); + } + CONSTEXPR npu_set_ifm_region_t& set_region(uint32_t value) + { + assert((value >> 3) == 0); + region = static_cast(value & ((1U << 3)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("region", std::to_string(region))); + } +#endif +#endif +}; + +struct npu_set_ifm_broadcast_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t broadcast_mode:4; + uint32_t reserved1:12; +#ifdef __cplusplus +public: + npu_set_ifm_broadcast_t(NPU_NAMESPACE::broadcast_mode _broadcast_mode) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_BROADCAST)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + broadcast_mode(static_cast(_broadcast_mode) & ((1U << 4)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_ifm_broadcast_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_BROADCAST)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + broadcast_mode(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_BROADCAST) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_BROADCAST); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); 
+ } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(broadcast_mode) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm_broadcast_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm_broadcast_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::broadcast_mode get_broadcast_mode() const + { + return static_cast(broadcast_mode); + } + CONSTEXPR npu_set_ifm_broadcast_t& set_broadcast_mode(NPU_NAMESPACE::broadcast_mode value) + { + broadcast_mode = static_cast(value) & ((1U << 4)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("broadcast_mode", (broadcast_mode < (sizeof(broadcast_mode_str)/sizeof(broadcast_mode_str[0])) ? 
broadcast_mode_str[broadcast_mode] : "****"))); + } +#endif +#endif +}; + +struct npu_set_ofm_width_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t width_m1:16; +#ifdef __cplusplus +public: + npu_set_ofm_width_m1_t(uint32_t _width_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_WIDTH_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + width_m1(_width_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ofm_width_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_WIDTH_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + width_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_WIDTH_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_WIDTH_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(width_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ofm_width_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ofm_width_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_width_m1() const + { + return static_cast(width_m1); + } + CONSTEXPR npu_set_ofm_width_m1_t& set_width_m1(uint32_t value) + { + assert((value >> 16) == 0); + width_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef 
NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("width_m1", std::to_string(width_m1))); + } +#endif +#endif +}; + +struct npu_set_ofm_height_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t height_m1:16; +#ifdef __cplusplus +public: + npu_set_ofm_height_m1_t(uint32_t _height_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(_height_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ofm_height_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(height_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ofm_height_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ofm_height_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_height_m1() const + { + return static_cast(height_m1); + } + CONSTEXPR npu_set_ofm_height_m1_t& set_height_m1(uint32_t value) + { + 
assert((value >> 16) == 0); + height_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("height_m1", std::to_string(height_m1))); + } +#endif +#endif +}; + +struct npu_set_ofm_depth_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t depth_m1:16; +#ifdef __cplusplus +public: + npu_set_ofm_depth_m1_t(uint32_t _depth_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_DEPTH_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + depth_m1(_depth_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ofm_depth_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_DEPTH_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + depth_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_DEPTH_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_DEPTH_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(depth_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ofm_depth_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ofm_depth_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_depth_m1() const + { + return 
static_cast(depth_m1); + } + CONSTEXPR npu_set_ofm_depth_m1_t& set_depth_m1(uint32_t value) + { + assert((value >> 16) == 0); + depth_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("depth_m1", std::to_string(depth_m1))); + } +#endif +#endif +}; + +struct npu_set_ofm_precision_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t activation_type:1; + uint32_t activation_precision:2; + uint32_t reserved1:3; + uint32_t activation_format:2; + uint32_t scale_mode:1; + uint32_t activation_reverse:2; + uint32_t activation_transpose:3; + uint32_t activation_storage:2; +#ifdef __cplusplus +public: + npu_set_ofm_precision_t(NPU_NAMESPACE::activation_type _activation_type, NPU_NAMESPACE::activation_precision _activation_precision, NPU_NAMESPACE::activation_format _activation_format, NPU_NAMESPACE::ofm_scale_mode _scale_mode, NPU_NAMESPACE::activation_reverse _activation_reverse, NPU_NAMESPACE::activation_transpose _activation_transpose, NPU_NAMESPACE::activation_storage _activation_storage) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_PRECISION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + activation_type(static_cast(_activation_type) & ((1U << 1)-1)), + activation_precision(static_cast(_activation_precision) & ((1U << 2)-1)), + reserved1(0), + activation_format(static_cast(_activation_format) & ((1U << 2)-1)), + scale_mode(static_cast(_scale_mode) & ((1U << 1)-1)), + activation_reverse(static_cast(_activation_reverse) & ((1U << 2)-1)), + activation_transpose(static_cast(_activation_transpose) & ((1U << 3)-1)), + activation_storage(static_cast(_activation_storage) & ((1U << 2)-1)) + {} + CONSTEXPR npu_set_ofm_precision_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_PRECISION)), + reserved0(0), + 
control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + activation_type(0), + activation_precision(0), + reserved1(0), + activation_format(0), + scale_mode(0), + activation_reverse(0), + activation_transpose(0), + activation_storage(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_PRECISION) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_PRECISION); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(activation_type) << 16; + word |= uint32_t(activation_precision) << 17; + word |= uint32_t(activation_format) << 22; + word |= uint32_t(scale_mode) << 24; + word |= uint32_t(activation_reverse) << 25; + word |= uint32_t(activation_transpose) << 27; + word |= uint32_t(activation_storage) << 30; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ofm_precision_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ofm_precision_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::activation_type get_activation_type() const + { + return static_cast(activation_type); + } + CONSTEXPR npu_set_ofm_precision_t& set_activation_type(NPU_NAMESPACE::activation_type value) + { + activation_type = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::activation_precision get_activation_precision() const + { + return static_cast(activation_precision); + } + CONSTEXPR 
npu_set_ofm_precision_t& set_activation_precision(NPU_NAMESPACE::activation_precision value) + { + activation_precision = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::activation_format get_activation_format() const + { + return static_cast(activation_format); + } + CONSTEXPR npu_set_ofm_precision_t& set_activation_format(NPU_NAMESPACE::activation_format value) + { + activation_format = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::ofm_scale_mode get_scale_mode() const + { + return static_cast(scale_mode); + } + CONSTEXPR npu_set_ofm_precision_t& set_scale_mode(NPU_NAMESPACE::ofm_scale_mode value) + { + scale_mode = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::activation_reverse get_activation_reverse() const + { + return static_cast(activation_reverse); + } + CONSTEXPR npu_set_ofm_precision_t& set_activation_reverse(NPU_NAMESPACE::activation_reverse value) + { + activation_reverse = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::activation_transpose get_activation_transpose() const + { + return static_cast(activation_transpose); + } + CONSTEXPR npu_set_ofm_precision_t& set_activation_transpose(NPU_NAMESPACE::activation_transpose value) + { + activation_transpose = static_cast(value) & ((1U << 3)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::activation_storage get_activation_storage() const + { + return static_cast(activation_storage); + } + CONSTEXPR npu_set_ofm_precision_t& set_activation_storage(NPU_NAMESPACE::activation_storage value) + { + activation_storage = static_cast(value) & ((1U << 2)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("activation_type", (activation_type < (sizeof(activation_type_str)/sizeof(activation_type_str[0])) ? 
activation_type_str[activation_type] : "****"))); + fields.push_back(std::make_pair("activation_precision", (activation_precision < (sizeof(activation_precision_str)/sizeof(activation_precision_str[0])) ? activation_precision_str[activation_precision] : "****"))); + fields.push_back(std::make_pair("activation_format", (activation_format < (sizeof(activation_format_str)/sizeof(activation_format_str[0])) ? activation_format_str[activation_format] : "****"))); + fields.push_back(std::make_pair("scale_mode", (scale_mode < (sizeof(ofm_scale_mode_str)/sizeof(ofm_scale_mode_str[0])) ? ofm_scale_mode_str[scale_mode] : "****"))); + fields.push_back(std::make_pair("activation_reverse", (activation_reverse < (sizeof(activation_reverse_str)/sizeof(activation_reverse_str[0])) ? activation_reverse_str[activation_reverse] : "****"))); + fields.push_back(std::make_pair("activation_transpose", (activation_transpose < (sizeof(activation_transpose_str)/sizeof(activation_transpose_str[0])) ? activation_transpose_str[activation_transpose] : "****"))); + fields.push_back(std::make_pair("activation_storage", (activation_storage < (sizeof(activation_storage_str)/sizeof(activation_storage_str[0])) ? 
activation_storage_str[activation_storage] : "****"))); + } +#endif +#endif +}; + +struct npu_set_ofm_blk_width_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t width_m1:7; + uint32_t reserved1:9; +#ifdef __cplusplus +public: + npu_set_ofm_blk_width_m1_t(uint32_t _width_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_WIDTH_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + width_m1(_width_m1 & ((1U << 7)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_ofm_blk_width_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_WIDTH_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + width_m1(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_WIDTH_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_WIDTH_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(width_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ofm_blk_width_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ofm_blk_width_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_width_m1() const + { + return static_cast(width_m1); + } + CONSTEXPR npu_set_ofm_blk_width_m1_t& set_width_m1(uint32_t value) + { + assert((value >> 
7) == 0); + width_m1 = static_cast(value & ((1U << 7)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("width_m1", std::to_string(width_m1))); + } +#endif +#endif +}; + +struct npu_set_ofm_blk_height_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t height_m1:7; + uint32_t reserved1:9; +#ifdef __cplusplus +public: + npu_set_ofm_blk_height_m1_t(uint32_t _height_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_HEIGHT_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(_height_m1 & ((1U << 7)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_ofm_blk_height_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_HEIGHT_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_HEIGHT_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_HEIGHT_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(height_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ofm_blk_height_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ofm_blk_height_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + 
return *this; + } + CONSTEXPR uint32_t get_height_m1() const + { + return static_cast(height_m1); + } + CONSTEXPR npu_set_ofm_blk_height_m1_t& set_height_m1(uint32_t value) + { + assert((value >> 7) == 0); + height_m1 = static_cast(value & ((1U << 7)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("height_m1", std::to_string(height_m1))); + } +#endif +#endif +}; + +struct npu_set_ofm_blk_depth_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t depth_m1:10; + uint32_t reserved1:6; +#ifdef __cplusplus +public: + npu_set_ofm_blk_depth_m1_t(uint32_t _depth_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_DEPTH_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + depth_m1(_depth_m1 & ((1U << 10)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_ofm_blk_depth_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_DEPTH_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + depth_m1(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_DEPTH_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_DEPTH_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(depth_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ofm_blk_depth_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() 
const + { + return static_cast(control); + } + CONSTEXPR npu_set_ofm_blk_depth_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_depth_m1() const + { + return static_cast(depth_m1); + } + CONSTEXPR npu_set_ofm_blk_depth_m1_t& set_depth_m1(uint32_t value) + { + assert((value >> 10) == 0); + depth_m1 = static_cast(value & ((1U << 10)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("depth_m1", std::to_string(depth_m1))); + } +#endif +#endif +}; + +struct npu_set_ofm_zero_point_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t zero_point:16; +#ifdef __cplusplus +public: + npu_set_ofm_zero_point_t(uint32_t _zero_point) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_ZERO_POINT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + zero_point(_zero_point & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ofm_zero_point_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_ZERO_POINT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + zero_point(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_ZERO_POINT) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_ZERO_POINT); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(zero_point) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ofm_zero_point_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + 
opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ofm_zero_point_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_zero_point() const + { + return static_cast(zero_point); + } + CONSTEXPR npu_set_ofm_zero_point_t& set_zero_point(uint32_t value) + { + assert((value >> 16) == 0); + zero_point = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("zero_point", std::to_string(zero_point))); + } +#endif +#endif +}; + +struct npu_set_ofm_width0_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t width_m1:16; +#ifdef __cplusplus +public: + npu_set_ofm_width0_m1_t(uint32_t _width_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_WIDTH0_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + width_m1(_width_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ofm_width0_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_WIDTH0_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + width_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_WIDTH0_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_WIDTH0_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(width_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return 
static_cast(opcode); + } + CONSTEXPR npu_set_ofm_width0_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ofm_width0_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_width_m1() const + { + return static_cast(width_m1); + } + CONSTEXPR npu_set_ofm_width0_m1_t& set_width_m1(uint32_t value) + { + assert((value >> 16) == 0); + width_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("width_m1", std::to_string(width_m1))); + } +#endif +#endif +}; + +struct npu_set_ofm_height0_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t height_m1:16; +#ifdef __cplusplus +public: + npu_set_ofm_height0_m1_t(uint32_t _height_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT0_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(_height_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ofm_height0_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT0_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT0_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT0_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= 
uint32_t(height_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ofm_height0_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ofm_height0_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_height_m1() const + { + return static_cast(height_m1); + } + CONSTEXPR npu_set_ofm_height0_m1_t& set_height_m1(uint32_t value) + { + assert((value >> 16) == 0); + height_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("height_m1", std::to_string(height_m1))); + } +#endif +#endif +}; + +struct npu_set_ofm_height1_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t height_m1:16; +#ifdef __cplusplus +public: + npu_set_ofm_height1_m1_t(uint32_t _height_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT1_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(_height_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ofm_height1_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT1_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT1_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT1_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator 
uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(height_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ofm_height1_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ofm_height1_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_height_m1() const + { + return static_cast(height_m1); + } + CONSTEXPR npu_set_ofm_height1_m1_t& set_height_m1(uint32_t value) + { + assert((value >> 16) == 0); + height_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("height_m1", std::to_string(height_m1))); + } +#endif +#endif +}; + +struct npu_set_ofm_region_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t region:3; + uint32_t reserved1:13; +#ifdef __cplusplus +public: + npu_set_ofm_region_t(uint32_t _region) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(_region & ((1U << 3)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_ofm_region_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_REGION) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = 
static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_REGION); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(region) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ofm_region_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ofm_region_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_region() const + { + return static_cast(region); + } + CONSTEXPR npu_set_ofm_region_t& set_region(uint32_t value) + { + assert((value >> 3) == 0); + region = static_cast(value & ((1U << 3)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("region", std::to_string(region))); + } +#endif +#endif +}; + +struct npu_set_kernel_width_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t width_m1:16; +#ifdef __cplusplus +public: + npu_set_kernel_width_m1_t(uint32_t _width_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_WIDTH_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + width_m1(_width_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_kernel_width_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_WIDTH_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + width_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_WIDTH_M1) && control == 
static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_WIDTH_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(width_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_kernel_width_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_kernel_width_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_width_m1() const + { + return static_cast(width_m1); + } + CONSTEXPR npu_set_kernel_width_m1_t& set_width_m1(uint32_t value) + { + assert((value >> 16) == 0); + width_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("width_m1", std::to_string(width_m1))); + } +#endif +#endif +}; + +struct npu_set_kernel_height_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t height_m1:16; +#ifdef __cplusplus +public: + npu_set_kernel_height_m1_t(uint32_t _height_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_HEIGHT_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(_height_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_kernel_height_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_HEIGHT_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(0) + {} + CONSTEXPR 
bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_HEIGHT_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_HEIGHT_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(height_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_kernel_height_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_kernel_height_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_height_m1() const + { + return static_cast(height_m1); + } + CONSTEXPR npu_set_kernel_height_m1_t& set_height_m1(uint32_t value) + { + assert((value >> 16) == 0); + height_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("height_m1", std::to_string(height_m1))); + } +#endif +#endif +}; + +struct npu_set_kernel_stride_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t stride_x_lsb:1; + uint32_t stride_y_lsb:1; + uint32_t weight_order:1; + uint32_t dilation_x:1; + uint32_t dilation_y:1; + uint32_t decomposition:1; + uint32_t stride_x_msb:1; + uint32_t reserved1:2; + uint32_t stride_y_msb:1; + uint32_t reserved2:6; +#ifdef __cplusplus +public: + npu_set_kernel_stride_t(uint32_t _stride_x_lsb, uint32_t _stride_y_lsb, NPU_NAMESPACE::weight_order 
_weight_order, NPU_NAMESPACE::kernel_dilation _dilation_x, NPU_NAMESPACE::kernel_dilation _dilation_y, NPU_NAMESPACE::kernel_decomposition _decomposition, uint32_t _stride_x_msb, uint32_t _stride_y_msb) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_STRIDE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + stride_x_lsb(_stride_x_lsb & ((1U << 1)-1)), + stride_y_lsb(_stride_y_lsb & ((1U << 1)-1)), + weight_order(static_cast(_weight_order) & ((1U << 1)-1)), + dilation_x(static_cast(_dilation_x) & ((1U << 1)-1)), + dilation_y(static_cast(_dilation_y) & ((1U << 1)-1)), + decomposition(static_cast(_decomposition) & ((1U << 1)-1)), + stride_x_msb(_stride_x_msb & ((1U << 1)-1)), + reserved1(0), + stride_y_msb(_stride_y_msb & ((1U << 1)-1)), + reserved2(0) + {} + CONSTEXPR npu_set_kernel_stride_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_STRIDE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + stride_x_lsb(0), + stride_y_lsb(0), + weight_order(0), + dilation_x(0), + dilation_y(0), + decomposition(0), + stride_x_msb(0), + reserved1(0), + stride_y_msb(0), + reserved2(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_STRIDE) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_STRIDE); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(stride_x_lsb) << 16; + word |= uint32_t(stride_y_lsb) << 17; + word |= uint32_t(weight_order) << 18; + word |= uint32_t(dilation_x) << 19; + word |= uint32_t(dilation_y) << 20; + word |= uint32_t(decomposition) << 21; + word |= uint32_t(stride_x_msb) << 22; + word |= uint32_t(stride_y_msb) << 25; + return word; + } + 
CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_kernel_stride_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_kernel_stride_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_stride_x_lsb() const + { + return static_cast(stride_x_lsb); + } + CONSTEXPR npu_set_kernel_stride_t& set_stride_x_lsb(uint32_t value) + { + assert((value >> 1) == 0); + stride_x_lsb = static_cast(value & ((1U << 1)-1)); + return *this; + } + CONSTEXPR uint32_t get_stride_y_lsb() const + { + return static_cast(stride_y_lsb); + } + CONSTEXPR npu_set_kernel_stride_t& set_stride_y_lsb(uint32_t value) + { + assert((value >> 1) == 0); + stride_y_lsb = static_cast(value & ((1U << 1)-1)); + return *this; + } + CONSTEXPR NPU_NAMESPACE::weight_order get_weight_order() const + { + return static_cast(weight_order); + } + CONSTEXPR npu_set_kernel_stride_t& set_weight_order(NPU_NAMESPACE::weight_order value) + { + weight_order = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::kernel_dilation get_dilation_x() const + { + return static_cast(dilation_x); + } + CONSTEXPR npu_set_kernel_stride_t& set_dilation_x(NPU_NAMESPACE::kernel_dilation value) + { + dilation_x = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::kernel_dilation get_dilation_y() const + { + return static_cast(dilation_y); + } + CONSTEXPR npu_set_kernel_stride_t& set_dilation_y(NPU_NAMESPACE::kernel_dilation value) + { + dilation_y = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::kernel_decomposition get_decomposition() const + { + return static_cast(decomposition); + } + CONSTEXPR 
npu_set_kernel_stride_t& set_decomposition(NPU_NAMESPACE::kernel_decomposition value) + { + decomposition = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR uint32_t get_stride_x_msb() const + { + return static_cast(stride_x_msb); + } + CONSTEXPR npu_set_kernel_stride_t& set_stride_x_msb(uint32_t value) + { + assert((value >> 1) == 0); + stride_x_msb = static_cast(value & ((1U << 1)-1)); + return *this; + } + CONSTEXPR uint32_t get_stride_y_msb() const + { + return static_cast(stride_y_msb); + } + CONSTEXPR npu_set_kernel_stride_t& set_stride_y_msb(uint32_t value) + { + assert((value >> 1) == 0); + stride_y_msb = static_cast(value & ((1U << 1)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("stride_x_lsb", std::to_string(stride_x_lsb))); + fields.push_back(std::make_pair("stride_y_lsb", std::to_string(stride_y_lsb))); + fields.push_back(std::make_pair("weight_order", (weight_order < (sizeof(weight_order_str)/sizeof(weight_order_str[0])) ? weight_order_str[weight_order] : "****"))); + fields.push_back(std::make_pair("dilation_x", (dilation_x < (sizeof(kernel_dilation_str)/sizeof(kernel_dilation_str[0])) ? kernel_dilation_str[dilation_x] : "****"))); + fields.push_back(std::make_pair("dilation_y", (dilation_y < (sizeof(kernel_dilation_str)/sizeof(kernel_dilation_str[0])) ? kernel_dilation_str[dilation_y] : "****"))); + fields.push_back(std::make_pair("decomposition", (decomposition < (sizeof(kernel_decomposition_str)/sizeof(kernel_decomposition_str[0])) ? 
kernel_decomposition_str[decomposition] : "****"))); + fields.push_back(std::make_pair("stride_x_msb", std::to_string(stride_x_msb))); + fields.push_back(std::make_pair("stride_y_msb", std::to_string(stride_y_msb))); + } +#endif +#endif +}; + +struct npu_set_acc_format_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t acc_format:2; + uint32_t reserved1:2; + uint32_t acc_input:2; + uint32_t acc_output:1; + uint32_t reserved2:1; + uint32_t microblock:3; + uint32_t reserved3:5; +#ifdef __cplusplus +public: + npu_set_acc_format_t(NPU_NAMESPACE::acc_format _acc_format, NPU_NAMESPACE::acc_input _acc_input, NPU_NAMESPACE::acc_output _acc_output, NPU_NAMESPACE::microblock _microblock) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACC_FORMAT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + acc_format(static_cast(_acc_format) & ((1U << 2)-1)), + reserved1(0), + acc_input(static_cast(_acc_input) & ((1U << 2)-1)), + acc_output(static_cast(_acc_output) & ((1U << 1)-1)), + reserved2(0), + microblock(static_cast(_microblock) & ((1U << 3)-1)), + reserved3(0) + {} + CONSTEXPR npu_set_acc_format_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACC_FORMAT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + acc_format(0), + reserved1(0), + acc_input(0), + acc_output(0), + reserved2(0), + microblock(0), + reserved3(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACC_FORMAT) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACC_FORMAT); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(acc_format) << 16; + word |= 
uint32_t(acc_input) << 20; + word |= uint32_t(acc_output) << 22; + word |= uint32_t(microblock) << 24; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_acc_format_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_acc_format_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::acc_format get_acc_format() const + { + return static_cast(acc_format); + } + CONSTEXPR npu_set_acc_format_t& set_acc_format(NPU_NAMESPACE::acc_format value) + { + acc_format = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::acc_input get_acc_input() const + { + return static_cast(acc_input); + } + CONSTEXPR npu_set_acc_format_t& set_acc_input(NPU_NAMESPACE::acc_input value) + { + acc_input = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::acc_output get_acc_output() const + { + return static_cast(acc_output); + } + CONSTEXPR npu_set_acc_format_t& set_acc_output(NPU_NAMESPACE::acc_output value) + { + acc_output = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::microblock get_microblock() const + { + return static_cast(microblock); + } + CONSTEXPR npu_set_acc_format_t& set_microblock(NPU_NAMESPACE::microblock value) + { + microblock = static_cast(value) & ((1U << 3)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("acc_format", (acc_format < (sizeof(acc_format_str)/sizeof(acc_format_str[0])) ? acc_format_str[acc_format] : "****"))); + fields.push_back(std::make_pair("acc_input", (acc_input < (sizeof(acc_input_str)/sizeof(acc_input_str[0])) ? 
acc_input_str[acc_input] : "****"))); + fields.push_back(std::make_pair("acc_output", (acc_output < (sizeof(acc_output_str)/sizeof(acc_output_str[0])) ? acc_output_str[acc_output] : "****"))); + fields.push_back(std::make_pair("microblock", (microblock < (sizeof(microblock_str)/sizeof(microblock_str[0])) ? microblock_str[microblock] : "****"))); + } +#endif +#endif +}; + +struct npu_set_activation_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t activation_function:5; + uint32_t table:3; + uint32_t reserved1:4; + uint32_t activation_clip_range:1; + uint32_t reserved2:3; +#ifdef __cplusplus +public: + npu_set_activation_t(NPU_NAMESPACE::activation_function _activation_function, uint32_t _table, NPU_NAMESPACE::activation_clip_range _activation_clip_range) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + activation_function(static_cast(_activation_function) & ((1U << 5)-1)), + table(_table & ((1U << 3)-1)), + reserved1(0), + activation_clip_range(static_cast(_activation_clip_range) & ((1U << 1)-1)), + reserved2(0) + {} + CONSTEXPR npu_set_activation_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + activation_function(0), + table(0), + reserved1(0), + activation_clip_range(0), + reserved2(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(activation_function) << 16; 
+ word |= uint32_t(table) << 21; + word |= uint32_t(activation_clip_range) << 28; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_activation_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_activation_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::activation_function get_activation_function() const + { + return static_cast(activation_function); + } + CONSTEXPR npu_set_activation_t& set_activation_function(NPU_NAMESPACE::activation_function value) + { + activation_function = static_cast(value) & ((1U << 5)-1); + return *this; + } + CONSTEXPR uint32_t get_table() const + { + return static_cast(table); + } + CONSTEXPR npu_set_activation_t& set_table(uint32_t value) + { + assert((value >> 3) == 0); + table = static_cast(value & ((1U << 3)-1)); + return *this; + } + CONSTEXPR NPU_NAMESPACE::activation_clip_range get_activation_clip_range() const + { + return static_cast(activation_clip_range); + } + CONSTEXPR npu_set_activation_t& set_activation_clip_range(NPU_NAMESPACE::activation_clip_range value) + { + activation_clip_range = static_cast(value) & ((1U << 1)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("activation_function", (activation_function < (sizeof(activation_function_str)/sizeof(activation_function_str[0])) ? activation_function_str[activation_function] : "****"))); + fields.push_back(std::make_pair("table", std::to_string(table))); + fields.push_back(std::make_pair("activation_clip_range", (activation_clip_range < (sizeof(activation_clip_range_str)/sizeof(activation_clip_range_str[0])) ? 
activation_clip_range_str[activation_clip_range] : "****"))); + } +#endif +#endif +}; + +struct npu_set_activation_min_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t clip_boundary:16; +#ifdef __cplusplus +public: + npu_set_activation_min_t(uint32_t _clip_boundary) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION_MIN)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + clip_boundary(_clip_boundary & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_activation_min_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION_MIN)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + clip_boundary(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION_MIN) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION_MIN); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(clip_boundary) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_activation_min_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_activation_min_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_clip_boundary() const + { + return static_cast(clip_boundary); + } + CONSTEXPR npu_set_activation_min_t& set_clip_boundary(uint32_t value) + { + assert((value >> 16) == 0); + 
clip_boundary = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("clip_boundary", std::to_string(clip_boundary))); + } +#endif +#endif +}; + +struct npu_set_activation_max_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t clip_boundary:16; +#ifdef __cplusplus +public: + npu_set_activation_max_t(uint32_t _clip_boundary) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION_MAX)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + clip_boundary(_clip_boundary & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_activation_max_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION_MAX)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + clip_boundary(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION_MAX) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION_MAX); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(clip_boundary) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_activation_max_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_activation_max_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_clip_boundary() 
const + { + return static_cast(clip_boundary); + } + CONSTEXPR npu_set_activation_max_t& set_clip_boundary(uint32_t value) + { + assert((value >> 16) == 0); + clip_boundary = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("clip_boundary", std::to_string(clip_boundary))); + } +#endif +#endif +}; + +struct npu_set_weight_region_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t region:3; + uint32_t reserved1:13; +#ifdef __cplusplus +public: + npu_set_weight_region_t(uint32_t _region) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_WEIGHT_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(_region & ((1U << 3)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_weight_region_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_WEIGHT_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_WEIGHT_REGION) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_WEIGHT_REGION); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(region) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_weight_region_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR 
npu_set_weight_region_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_region() const + { + return static_cast(region); + } + CONSTEXPR npu_set_weight_region_t& set_region(uint32_t value) + { + assert((value >> 3) == 0); + region = static_cast(value & ((1U << 3)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("region", std::to_string(region))); + } +#endif +#endif +}; + +struct npu_set_scale_region_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t region:3; + uint32_t reserved1:13; +#ifdef __cplusplus +public: + npu_set_scale_region_t(uint32_t _region) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_SCALE_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(_region & ((1U << 3)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_scale_region_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_SCALE_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_SCALE_REGION) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_SCALE_REGION); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(region) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_scale_region_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; 
+ } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_scale_region_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_region() const + { + return static_cast(region); + } + CONSTEXPR npu_set_scale_region_t& set_region(uint32_t value) + { + assert((value >> 3) == 0); + region = static_cast(value & ((1U << 3)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("region", std::to_string(region))); + } +#endif +#endif +}; + +struct npu_set_weight_format_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t weight_format:1; + uint32_t reserved1:3; + uint32_t weight_sparsity:1; + uint32_t reserved2:11; +#ifdef __cplusplus +public: + npu_set_weight_format_t(NPU_NAMESPACE::weight_format _weight_format, NPU_NAMESPACE::weight_sparsity _weight_sparsity) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_WEIGHT_FORMAT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + weight_format(static_cast(_weight_format) & ((1U << 1)-1)), + reserved1(0), + weight_sparsity(static_cast(_weight_sparsity) & ((1U << 1)-1)), + reserved2(0) + {} + CONSTEXPR npu_set_weight_format_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_WEIGHT_FORMAT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + weight_format(0), + reserved1(0), + weight_sparsity(0), + reserved2(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_WEIGHT_FORMAT) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_WEIGHT_FORMAT); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + 
operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(weight_format) << 16; + word |= uint32_t(weight_sparsity) << 20; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_weight_format_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_weight_format_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::weight_format get_weight_format() const + { + return static_cast(weight_format); + } + CONSTEXPR npu_set_weight_format_t& set_weight_format(NPU_NAMESPACE::weight_format value) + { + weight_format = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::weight_sparsity get_weight_sparsity() const + { + return static_cast(weight_sparsity); + } + CONSTEXPR npu_set_weight_format_t& set_weight_sparsity(NPU_NAMESPACE::weight_sparsity value) + { + weight_sparsity = static_cast(value) & ((1U << 1)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("weight_format", (weight_format < (sizeof(weight_format_str)/sizeof(weight_format_str[0])) ? weight_format_str[weight_format] : "****"))); + fields.push_back(std::make_pair("weight_sparsity", (weight_sparsity < (sizeof(weight_sparsity_str)/sizeof(weight_sparsity_str[0])) ? 
weight_sparsity_str[weight_sparsity] : "****"))); + } +#endif +#endif +}; + +struct npu_set_blockdep_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t blockdep:3; + uint32_t reserved1:13; +#ifdef __cplusplus +public: + npu_set_blockdep_t(uint32_t _blockdep) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_BLOCKDEP)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + blockdep(_blockdep & ((1U << 3)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_blockdep_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_BLOCKDEP)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + blockdep(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_BLOCKDEP) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_BLOCKDEP); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(blockdep) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_blockdep_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_blockdep_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_blockdep() const + { + return static_cast(blockdep); + } + CONSTEXPR npu_set_blockdep_t& set_blockdep(uint32_t value) + { + assert((value >> 3) == 0); + blockdep = static_cast(value & ((1U << 3)-1)); + return *this; + } 
+#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("blockdep", std::to_string(blockdep))); + } +#endif +#endif +}; + +struct npu_set_resize_x_scale_n_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t resize_x_scale_n_m1:11; + uint32_t reserved1:5; +#ifdef __cplusplus +public: + npu_set_resize_x_scale_n_m1_t(uint32_t _resize_x_scale_n_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_RESIZE_X_SCALE_N_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + resize_x_scale_n_m1(_resize_x_scale_n_m1 & ((1U << 11)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_resize_x_scale_n_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_RESIZE_X_SCALE_N_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + resize_x_scale_n_m1(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_RESIZE_X_SCALE_N_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_RESIZE_X_SCALE_N_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(resize_x_scale_n_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_resize_x_scale_n_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_resize_x_scale_n_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + 
return *this; + } + CONSTEXPR uint32_t get_resize_x_scale_n_m1() const + { + return static_cast(resize_x_scale_n_m1); + } + CONSTEXPR npu_set_resize_x_scale_n_m1_t& set_resize_x_scale_n_m1(uint32_t value) + { + assert((value >> 11) == 0); + resize_x_scale_n_m1 = static_cast(value & ((1U << 11)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("resize_x_scale_n_m1", std::to_string(resize_x_scale_n_m1))); + } +#endif +#endif +}; + +struct npu_set_resize_y_scale_n_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t resize_y_scale_n_m1:11; + uint32_t reserved1:5; +#ifdef __cplusplus +public: + npu_set_resize_y_scale_n_m1_t(uint32_t _resize_y_scale_n_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_RESIZE_Y_SCALE_N_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + resize_y_scale_n_m1(_resize_y_scale_n_m1 & ((1U << 11)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_resize_y_scale_n_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_RESIZE_Y_SCALE_N_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + resize_y_scale_n_m1(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_RESIZE_Y_SCALE_N_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_RESIZE_Y_SCALE_N_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(resize_y_scale_n_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_resize_y_scale_n_m1_t& 
set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_resize_y_scale_n_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_resize_y_scale_n_m1() const + { + return static_cast(resize_y_scale_n_m1); + } + CONSTEXPR npu_set_resize_y_scale_n_m1_t& set_resize_y_scale_n_m1(uint32_t value) + { + assert((value >> 11) == 0); + resize_y_scale_n_m1 = static_cast(value & ((1U << 11)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("resize_y_scale_n_m1", std::to_string(resize_y_scale_n_m1))); + } +#endif +#endif +}; + +struct npu_set_resize_x_offset_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t resize_x_offset:12; + uint32_t reserved1:4; +#ifdef __cplusplus +public: + npu_set_resize_x_offset_t(uint32_t _resize_x_offset) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_RESIZE_X_OFFSET)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + resize_x_offset(_resize_x_offset & ((1U << 12)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_resize_x_offset_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_RESIZE_X_OFFSET)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + resize_x_offset(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_RESIZE_X_OFFSET) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_RESIZE_X_OFFSET); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + 
uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(resize_x_offset) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_resize_x_offset_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_resize_x_offset_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_resize_x_offset() const + { + return static_cast(resize_x_offset); + } + CONSTEXPR npu_set_resize_x_offset_t& set_resize_x_offset(uint32_t value) + { + assert((value >> 12) == 0); + resize_x_offset = static_cast(value & ((1U << 12)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("resize_x_offset", std::to_string(((resize_x_offset <= std::numeric_limits::max() ? 
static_cast(resize_x_offset) : resize_x_offset - std::numeric_limits::min() + std::numeric_limits::max()) << 20) >> 20))); + } +#endif +#endif +}; + +struct npu_set_resize_y_offset_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t resize_y_offset:12; + uint32_t reserved1:4; +#ifdef __cplusplus +public: + npu_set_resize_y_offset_t(uint32_t _resize_y_offset) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_RESIZE_Y_OFFSET)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + resize_y_offset(_resize_y_offset & ((1U << 12)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_resize_y_offset_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_RESIZE_Y_OFFSET)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + resize_y_offset(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_RESIZE_Y_OFFSET) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_RESIZE_Y_OFFSET); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(resize_y_offset) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_resize_y_offset_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_resize_y_offset_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_resize_y_offset() const + { + return 
static_cast(resize_y_offset); + } + CONSTEXPR npu_set_resize_y_offset_t& set_resize_y_offset(uint32_t value) + { + assert((value >> 12) == 0); + resize_y_offset = static_cast(value & ((1U << 12)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("resize_y_offset", std::to_string(((resize_y_offset <= std::numeric_limits::max() ? static_cast(resize_y_offset) : resize_y_offset - std::numeric_limits::min() + std::numeric_limits::max()) << 20) >> 20))); + } +#endif +#endif +}; + +struct npu_set_dma0_src_region_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t region:3; + uint32_t reserved1:5; + uint32_t region_mode:1; + uint32_t stride_mode:2; + uint32_t idx_mode:1; + uint32_t reserved2:4; +#ifdef __cplusplus +public: + npu_set_dma0_src_region_t(uint32_t _region, NPU_NAMESPACE::dma_region_mode _region_mode, NPU_NAMESPACE::dma_stride_mode _stride_mode, NPU_NAMESPACE::dma_idx_mode _idx_mode) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SRC_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(_region & ((1U << 3)-1)), + reserved1(0), + region_mode(static_cast(_region_mode) & ((1U << 1)-1)), + stride_mode(static_cast(_stride_mode) & ((1U << 2)-1)), + idx_mode(static_cast(_idx_mode) & ((1U << 1)-1)), + reserved2(0) + {} + CONSTEXPR npu_set_dma0_src_region_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SRC_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(0), + reserved1(0), + region_mode(0), + stride_mode(0), + idx_mode(0), + reserved2(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SRC_REGION) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = 
static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SRC_REGION); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(region) << 16; + word |= uint32_t(region_mode) << 24; + word |= uint32_t(stride_mode) << 25; + word |= uint32_t(idx_mode) << 27; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_dma0_src_region_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_dma0_src_region_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_region() const + { + return static_cast(region); + } + CONSTEXPR npu_set_dma0_src_region_t& set_region(uint32_t value) + { + assert((value >> 3) == 0); + region = static_cast(value & ((1U << 3)-1)); + return *this; + } + CONSTEXPR NPU_NAMESPACE::dma_region_mode get_region_mode() const + { + return static_cast(region_mode); + } + CONSTEXPR npu_set_dma0_src_region_t& set_region_mode(NPU_NAMESPACE::dma_region_mode value) + { + region_mode = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::dma_stride_mode get_stride_mode() const + { + return static_cast(stride_mode); + } + CONSTEXPR npu_set_dma0_src_region_t& set_stride_mode(NPU_NAMESPACE::dma_stride_mode value) + { + stride_mode = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::dma_idx_mode get_idx_mode() const + { + return static_cast(idx_mode); + } + CONSTEXPR npu_set_dma0_src_region_t& set_idx_mode(NPU_NAMESPACE::dma_idx_mode value) + { + idx_mode = static_cast(value) & ((1U << 1)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE 
+ void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("region", std::to_string(region))); + fields.push_back(std::make_pair("region_mode", (region_mode < (sizeof(dma_region_mode_str)/sizeof(dma_region_mode_str[0])) ? dma_region_mode_str[region_mode] : "****"))); + fields.push_back(std::make_pair("stride_mode", (stride_mode < (sizeof(dma_stride_mode_str)/sizeof(dma_stride_mode_str[0])) ? dma_stride_mode_str[stride_mode] : "****"))); + fields.push_back(std::make_pair("idx_mode", (idx_mode < (sizeof(dma_idx_mode_str)/sizeof(dma_idx_mode_str[0])) ? dma_idx_mode_str[idx_mode] : "****"))); + } +#endif +#endif +}; + +struct npu_set_dma0_dst_region_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t region:3; + uint32_t reserved1:5; + uint32_t region_mode:1; + uint32_t reserved2:2; + uint32_t idx_mode:1; + uint32_t reserved3:4; +#ifdef __cplusplus +public: + npu_set_dma0_dst_region_t(uint32_t _region, NPU_NAMESPACE::dma_region_mode _region_mode, NPU_NAMESPACE::dma_idx_mode _idx_mode) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_DST_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(_region & ((1U << 3)-1)), + reserved1(0), + region_mode(static_cast(_region_mode) & ((1U << 1)-1)), + reserved2(0), + idx_mode(static_cast(_idx_mode) & ((1U << 1)-1)), + reserved3(0) + {} + CONSTEXPR npu_set_dma0_dst_region_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_DST_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(0), + reserved1(0), + region_mode(0), + reserved2(0), + idx_mode(0), + reserved3(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_DST_REGION) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = 
static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_DST_REGION); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(region) << 16; + word |= uint32_t(region_mode) << 24; + word |= uint32_t(idx_mode) << 27; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_dma0_dst_region_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_dma0_dst_region_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_region() const + { + return static_cast(region); + } + CONSTEXPR npu_set_dma0_dst_region_t& set_region(uint32_t value) + { + assert((value >> 3) == 0); + region = static_cast(value & ((1U << 3)-1)); + return *this; + } + CONSTEXPR NPU_NAMESPACE::dma_region_mode get_region_mode() const + { + return static_cast(region_mode); + } + CONSTEXPR npu_set_dma0_dst_region_t& set_region_mode(NPU_NAMESPACE::dma_region_mode value) + { + region_mode = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::dma_idx_mode get_idx_mode() const + { + return static_cast(idx_mode); + } + CONSTEXPR npu_set_dma0_dst_region_t& set_idx_mode(NPU_NAMESPACE::dma_idx_mode value) + { + idx_mode = static_cast(value) & ((1U << 1)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("region", std::to_string(region))); + fields.push_back(std::make_pair("region_mode", (region_mode < (sizeof(dma_region_mode_str)/sizeof(dma_region_mode_str[0])) ? 
dma_region_mode_str[region_mode] : "****"))); + fields.push_back(std::make_pair("idx_mode", (idx_mode < (sizeof(dma_idx_mode_str)/sizeof(dma_idx_mode_str[0])) ? dma_idx_mode_str[idx_mode] : "****"))); + } +#endif +#endif +}; + +struct npu_set_dma0_size0_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t size:16; +#ifdef __cplusplus +public: + npu_set_dma0_size0_t(uint32_t _size) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SIZE0)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + size(_size & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_dma0_size0_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SIZE0)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + size(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SIZE0) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SIZE0); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(size) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_dma0_size0_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_dma0_size0_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_size() const + { + return static_cast(size); + } + CONSTEXPR npu_set_dma0_size0_t& set_size(uint32_t value) + { + assert((value >> 16) 
== 0); + size = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("size", std::to_string(size))); + } +#endif +#endif +}; + +struct npu_set_dma0_size1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t size:16; +#ifdef __cplusplus +public: + npu_set_dma0_size1_t(uint32_t _size) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SIZE1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + size(_size & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_dma0_size1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SIZE1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + size(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SIZE1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SIZE1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(size) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_dma0_size1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_dma0_size1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_size() const + { + return static_cast(size); + } + CONSTEXPR npu_set_dma0_size1_t& set_size(uint32_t value) + { + 
assert((value >> 16) == 0); + size = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("size", std::to_string(size))); + } +#endif +#endif +}; + +struct npu_set_dma0_idx_region_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t region:3; + uint32_t reserved1:13; +#ifdef __cplusplus +public: + npu_set_dma0_idx_region_t(uint32_t _region) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_IDX_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(_region & ((1U << 3)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_dma0_idx_region_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_IDX_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_IDX_REGION) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_IDX_REGION); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(region) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_dma0_idx_region_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_dma0_idx_region_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR 
uint32_t get_region() const + { + return static_cast(region); + } + CONSTEXPR npu_set_dma0_idx_region_t& set_region(uint32_t value) + { + assert((value >> 3) == 0); + region = static_cast(value & ((1U << 3)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("region", std::to_string(region))); + } +#endif +#endif +}; + +struct npu_set_ifm2_broadcast_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t broadcast_mode:4; + uint32_t reserved1:12; +#ifdef __cplusplus +public: + npu_set_ifm2_broadcast_t(NPU_NAMESPACE::broadcast_mode _broadcast_mode) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_BROADCAST)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + broadcast_mode(static_cast(_broadcast_mode) & ((1U << 4)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_ifm2_broadcast_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_BROADCAST)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + broadcast_mode(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_BROADCAST) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_BROADCAST); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(broadcast_mode) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm2_broadcast_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl 
get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm2_broadcast_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::broadcast_mode get_broadcast_mode() const + { + return static_cast(broadcast_mode); + } + CONSTEXPR npu_set_ifm2_broadcast_t& set_broadcast_mode(NPU_NAMESPACE::broadcast_mode value) + { + broadcast_mode = static_cast(value) & ((1U << 4)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("broadcast_mode", (broadcast_mode < (sizeof(broadcast_mode_str)/sizeof(broadcast_mode_str[0])) ? broadcast_mode_str[broadcast_mode] : "****"))); + } +#endif +#endif +}; + +struct npu_set_ifm2_precision_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t activation_type:1; + uint32_t reserved1:1; + uint32_t activation_precision:2; + uint32_t reserved2:2; + uint32_t activation_format:2; + uint32_t reserved3:6; + uint32_t activation_storage:2; +#ifdef __cplusplus +public: + npu_set_ifm2_precision_t(NPU_NAMESPACE::activation_type _activation_type, NPU_NAMESPACE::activation_precision _activation_precision, NPU_NAMESPACE::activation_format _activation_format, NPU_NAMESPACE::activation_storage _activation_storage) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_PRECISION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + activation_type(static_cast(_activation_type) & ((1U << 1)-1)), + reserved1(0), + activation_precision(static_cast(_activation_precision) & ((1U << 2)-1)), + reserved2(0), + activation_format(static_cast(_activation_format) & ((1U << 2)-1)), + reserved3(0), + activation_storage(static_cast(_activation_storage) & ((1U << 2)-1)) + {} + CONSTEXPR npu_set_ifm2_precision_t() : + 
opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_PRECISION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + activation_type(0), + reserved1(0), + activation_precision(0), + reserved2(0), + activation_format(0), + reserved3(0), + activation_storage(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_PRECISION) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_PRECISION); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(activation_type) << 16; + word |= uint32_t(activation_precision) << 18; + word |= uint32_t(activation_format) << 22; + word |= uint32_t(activation_storage) << 30; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm2_precision_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm2_precision_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::activation_type get_activation_type() const + { + return static_cast(activation_type); + } + CONSTEXPR npu_set_ifm2_precision_t& set_activation_type(NPU_NAMESPACE::activation_type value) + { + activation_type = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::activation_precision get_activation_precision() const + { + return static_cast(activation_precision); + } + CONSTEXPR npu_set_ifm2_precision_t& set_activation_precision(NPU_NAMESPACE::activation_precision value) 
+ { + activation_precision = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::activation_format get_activation_format() const + { + return static_cast(activation_format); + } + CONSTEXPR npu_set_ifm2_precision_t& set_activation_format(NPU_NAMESPACE::activation_format value) + { + activation_format = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::activation_storage get_activation_storage() const + { + return static_cast(activation_storage); + } + CONSTEXPR npu_set_ifm2_precision_t& set_activation_storage(NPU_NAMESPACE::activation_storage value) + { + activation_storage = static_cast(value) & ((1U << 2)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("activation_type", (activation_type < (sizeof(activation_type_str)/sizeof(activation_type_str[0])) ? activation_type_str[activation_type] : "****"))); + fields.push_back(std::make_pair("activation_precision", (activation_precision < (sizeof(activation_precision_str)/sizeof(activation_precision_str[0])) ? activation_precision_str[activation_precision] : "****"))); + fields.push_back(std::make_pair("activation_format", (activation_format < (sizeof(activation_format_str)/sizeof(activation_format_str[0])) ? activation_format_str[activation_format] : "****"))); + fields.push_back(std::make_pair("activation_storage", (activation_storage < (sizeof(activation_storage_str)/sizeof(activation_storage_str[0])) ? 
activation_storage_str[activation_storage] : "****"))); + } +#endif +#endif +}; + +struct npu_set_ifm2_zero_point_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t zero_point:16; +#ifdef __cplusplus +public: + npu_set_ifm2_zero_point_t(uint32_t _zero_point) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_ZERO_POINT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + zero_point(_zero_point & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ifm2_zero_point_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_ZERO_POINT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + zero_point(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_ZERO_POINT) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_ZERO_POINT); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(zero_point) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm2_zero_point_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm2_zero_point_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_zero_point() const + { + return static_cast(zero_point); + } + CONSTEXPR npu_set_ifm2_zero_point_t& set_zero_point(uint32_t value) + { + assert((value >> 16) == 0); + zero_point = static_cast(value & 
((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("zero_point", std::to_string(zero_point))); + } +#endif +#endif +}; + +struct npu_set_ifm2_width0_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t width_m1:16; +#ifdef __cplusplus +public: + npu_set_ifm2_width0_m1_t(uint32_t _width_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_WIDTH0_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + width_m1(_width_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ifm2_width0_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_WIDTH0_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + width_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_WIDTH0_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_WIDTH0_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(width_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm2_width0_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm2_width0_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_width_m1() const + { + return static_cast(width_m1); + } + CONSTEXPR 
npu_set_ifm2_width0_m1_t& set_width_m1(uint32_t value) + { + assert((value >> 16) == 0); + width_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("width_m1", std::to_string(width_m1))); + } +#endif +#endif +}; + +struct npu_set_ifm2_height0_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t height_m1:16; +#ifdef __cplusplus +public: + npu_set_ifm2_height0_m1_t(uint32_t _height_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_HEIGHT0_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(_height_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ifm2_height0_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_HEIGHT0_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_HEIGHT0_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_HEIGHT0_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(height_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm2_height0_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm2_height0_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); 
+ return *this; + } + CONSTEXPR uint32_t get_height_m1() const + { + return static_cast(height_m1); + } + CONSTEXPR npu_set_ifm2_height0_m1_t& set_height_m1(uint32_t value) + { + assert((value >> 16) == 0); + height_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("height_m1", std::to_string(height_m1))); + } +#endif +#endif +}; + +struct npu_set_ifm2_height1_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t height_m1:16; +#ifdef __cplusplus +public: + npu_set_ifm2_height1_m1_t(uint32_t _height_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_HEIGHT1_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(_height_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ifm2_height1_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_HEIGHT1_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_HEIGHT1_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_HEIGHT1_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(height_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm2_height1_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR 
npu_set_ifm2_height1_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_height_m1() const + { + return static_cast(height_m1); + } + CONSTEXPR npu_set_ifm2_height1_m1_t& set_height_m1(uint32_t value) + { + assert((value >> 16) == 0); + height_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("height_m1", std::to_string(height_m1))); + } +#endif +#endif +}; + +struct npu_set_ifm2_region_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t region:3; + uint32_t reserved1:13; +#ifdef __cplusplus +public: + npu_set_ifm2_region_t(uint32_t _region) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(_region & ((1U << 3)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_ifm2_region_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_REGION) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_REGION); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(region) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm2_region_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); 
+ return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm2_region_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_region() const + { + return static_cast(region); + } + CONSTEXPR npu_set_ifm2_region_t& set_region(uint32_t value) + { + assert((value >> 3) == 0); + region = static_cast(value & ((1U << 3)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("region", std::to_string(region))); + } +#endif +#endif +}; + +struct npu_set_ifm_base0_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ifm_base0_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE0)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm_base0_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE0)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE0) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE0); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; 
+ return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ifm_base0_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ifm_base1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ifm_base1_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm_base1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE1) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) 
| addr_lo; + } + CONSTEXPR npu_set_ifm_base1_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ifm_base2_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ifm_base2_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE2)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm_base2_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE2)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE2) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE2); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ifm_base2_t& set_addr(uint64_t value) + { + addr_lo = 
static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ifm_base3_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ifm_base3_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE3)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm_base3_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE3)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE3) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE3); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ifm_base3_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) 
& static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ifm_stride_x_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ifm_stride_x_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_X)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm_stride_x_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_X)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_X) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_X); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ifm_stride_x_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef 
NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ifm_stride_y_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ifm_stride_y_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_Y)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm_stride_y_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_Y)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_Y) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_Y); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ifm_stride_y_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + 
std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ifm_stride_c_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ifm_stride_c_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_C)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm_stride_c_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_C)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_C) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_C); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ifm_stride_c_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + 
fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ofm_base0_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ofm_base0_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE0)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ofm_base0_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE0)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE0) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE0); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ofm_base0_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct 
npu_set_ofm_base1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ofm_base1_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ofm_base1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE1) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ofm_base1_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ofm_base2_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t 
reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ofm_base2_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE2)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ofm_base2_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE2)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE2) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE2); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ofm_base2_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ofm_base3_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t 
addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ofm_base3_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE3)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ofm_base3_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE3)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE3) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE3); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ofm_base3_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ofm_stride_x_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ofm_stride_x_t(uint64_t _addr) : + 
opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_X)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ofm_stride_x_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_X)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_X) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_X); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ofm_stride_x_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ofm_stride_y_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ofm_stride_y_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_Y)), + 
reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ofm_stride_y_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_Y)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_Y) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_Y); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ofm_stride_y_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ofm_stride_c_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ofm_stride_c_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_C)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + 
addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ofm_stride_c_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_C)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_C) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_C); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ofm_stride_c_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_weight_base_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_weight_base_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT_BASE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), 
+ reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_weight_base_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT_BASE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT_BASE) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT_BASE); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_weight_base_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_weight_length_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t reserved1:16; + uint32_t length:32; +#ifdef __cplusplus +public: + npu_set_weight_length_t(uint32_t _length) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT_LENGTH)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + reserved1(0), + length(_length) + {} + CONSTEXPR npu_set_weight_length_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT_LENGTH)), + reserved0(0), + 
control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + reserved1(0), + length(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT_LENGTH) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT_LENGTH); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(length) << 32; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd1_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_weight_length_t& set_opcode(NPU_NAMESPACE::cmd1_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_weight_length_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_length() const + { + return static_cast(length); + } + CONSTEXPR npu_set_weight_length_t& set_length(uint32_t value) + { + length = value; + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("length", std::to_string(length))); + } +#endif +#endif +}; + +struct npu_set_scale_base_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_scale_base_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE_BASE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & 
static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_scale_base_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE_BASE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE_BASE) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE_BASE); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_scale_base_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_scale_length_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t reserved1:16; + uint32_t length:20; + uint32_t reserved2:12; +#ifdef __cplusplus +public: + npu_set_scale_length_t(uint32_t _length) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE_LENGTH)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + reserved1(0), + length(_length & ((1U << 20)-1)), + reserved2(0) + {} + CONSTEXPR npu_set_scale_length_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE_LENGTH)), + reserved0(0), + 
control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + reserved1(0), + length(0), + reserved2(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE_LENGTH) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE_LENGTH); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(length) << 32; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd1_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_scale_length_t& set_opcode(NPU_NAMESPACE::cmd1_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_scale_length_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_length() const + { + return static_cast(length); + } + CONSTEXPR npu_set_scale_length_t& set_length(uint32_t value) + { + assert((value >> 20) == 0); + length = value & ((1U << 20)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("length", std::to_string(length))); + } +#endif +#endif +}; + +struct npu_set_ofm_scale_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t shift:6; + uint32_t dbl_rnd:5; + uint32_t reserved1:2; + uint32_t round_mode:3; + uint32_t scale:31; + uint32_t reserved2:1; +#ifdef __cplusplus +public: + npu_set_ofm_scale_t(uint32_t _shift, uint32_t _dbl_rnd, NPU_NAMESPACE::round_mode_ofm _round_mode, uint32_t _scale) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_SCALE)), + reserved0(0), + 
control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + shift(_shift & ((1U << 6)-1)), + dbl_rnd(_dbl_rnd & ((1U << 5)-1)), + reserved1(0), + round_mode(static_cast(_round_mode) & ((1U << 3)-1)), + scale(_scale & ((1U << 31)-1)), + reserved2(0) + {} + CONSTEXPR npu_set_ofm_scale_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_SCALE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + shift(0), + dbl_rnd(0), + reserved1(0), + round_mode(0), + scale(0), + reserved2(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_SCALE) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_SCALE); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(shift) << 16; + word |= uint64_t(dbl_rnd) << 22; + word |= uint64_t(round_mode) << 29; + word |= uint64_t(scale) << 32; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd1_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ofm_scale_t& set_opcode(NPU_NAMESPACE::cmd1_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ofm_scale_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_shift() const + { + return static_cast(shift); + } + CONSTEXPR npu_set_ofm_scale_t& set_shift(uint32_t value) + { + assert((value >> 6) == 0); + shift = static_cast(value & ((1U << 6)-1)); + return *this; + } + CONSTEXPR uint32_t get_dbl_rnd() const + { + return static_cast(dbl_rnd); + } + CONSTEXPR npu_set_ofm_scale_t& set_dbl_rnd(uint32_t value) + { + assert((value >> 
5) == 0); + dbl_rnd = static_cast(value & ((1U << 5)-1)); + return *this; + } + CONSTEXPR NPU_NAMESPACE::round_mode_ofm get_round_mode() const + { + return static_cast(round_mode); + } + CONSTEXPR npu_set_ofm_scale_t& set_round_mode(NPU_NAMESPACE::round_mode_ofm value) + { + round_mode = static_cast(value) & ((1U << 3)-1); + return *this; + } + CONSTEXPR uint32_t get_scale() const + { + return static_cast(scale); + } + CONSTEXPR npu_set_ofm_scale_t& set_scale(uint32_t value) + { + assert((value >> 31) == 0); + scale = value & ((1U << 31)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("shift", std::to_string(shift))); + fields.push_back(std::make_pair("dbl_rnd", std::to_string(dbl_rnd))); + fields.push_back(std::make_pair("round_mode", (round_mode < (sizeof(round_mode_ofm_str)/sizeof(round_mode_ofm_str[0])) ? round_mode_ofm_str[round_mode] : "****"))); + fields.push_back(std::make_pair("scale", std::to_string(scale))); + } +#endif +#endif +}; + +struct npu_set_ifm_scale_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t shift:6; + uint32_t dbl_rnd:5; + uint32_t reserved1:2; + uint32_t round_mode:1; + uint32_t reserved2:2; + uint32_t scale:31; + uint32_t reserved3:1; +#ifdef __cplusplus +public: + npu_set_ifm_scale_t(uint32_t _shift, uint32_t _dbl_rnd, NPU_NAMESPACE::round_mode_ifm _round_mode, uint32_t _scale) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_SCALE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + shift(_shift & ((1U << 6)-1)), + dbl_rnd(_dbl_rnd & ((1U << 5)-1)), + reserved1(0), + round_mode(static_cast(_round_mode) & ((1U << 1)-1)), + reserved2(0), + scale(_scale & ((1U << 31)-1)), + reserved3(0) + {} + CONSTEXPR npu_set_ifm_scale_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_SCALE)), + reserved0(0), + 
control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + shift(0), + dbl_rnd(0), + reserved1(0), + round_mode(0), + reserved2(0), + scale(0), + reserved3(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_SCALE) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_SCALE); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(shift) << 16; + word |= uint64_t(dbl_rnd) << 22; + word |= uint64_t(round_mode) << 29; + word |= uint64_t(scale) << 32; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd1_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm_scale_t& set_opcode(NPU_NAMESPACE::cmd1_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm_scale_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_shift() const + { + return static_cast(shift); + } + CONSTEXPR npu_set_ifm_scale_t& set_shift(uint32_t value) + { + assert((value >> 6) == 0); + shift = static_cast(value & ((1U << 6)-1)); + return *this; + } + CONSTEXPR uint32_t get_dbl_rnd() const + { + return static_cast(dbl_rnd); + } + CONSTEXPR npu_set_ifm_scale_t& set_dbl_rnd(uint32_t value) + { + assert((value >> 5) == 0); + dbl_rnd = static_cast(value & ((1U << 5)-1)); + return *this; + } + CONSTEXPR NPU_NAMESPACE::round_mode_ifm get_round_mode() const + { + return static_cast(round_mode); + } + CONSTEXPR npu_set_ifm_scale_t& set_round_mode(NPU_NAMESPACE::round_mode_ifm value) + { + round_mode = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR 
uint32_t get_scale() const + { + return static_cast(scale); + } + CONSTEXPR npu_set_ifm_scale_t& set_scale(uint32_t value) + { + assert((value >> 31) == 0); + scale = value & ((1U << 31)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("shift", std::to_string(shift))); + fields.push_back(std::make_pair("dbl_rnd", std::to_string(dbl_rnd))); + fields.push_back(std::make_pair("round_mode", (round_mode < (sizeof(round_mode_ifm_str)/sizeof(round_mode_ifm_str[0])) ? round_mode_ifm_str[round_mode] : "****"))); + fields.push_back(std::make_pair("scale", std::to_string(scale))); + } +#endif +#endif +}; + +struct npu_set_ifm2_scale_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t shift:6; + uint32_t dbl_rnd:5; + uint32_t reserved1:2; + uint32_t round_mode:1; + uint32_t reserved2:2; + uint32_t scale:31; + uint32_t reserved3:1; +#ifdef __cplusplus +public: + npu_set_ifm2_scale_t(uint32_t _shift, uint32_t _dbl_rnd, NPU_NAMESPACE::round_mode_ifm _round_mode, uint32_t _scale) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_SCALE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + shift(_shift & ((1U << 6)-1)), + dbl_rnd(_dbl_rnd & ((1U << 5)-1)), + reserved1(0), + round_mode(static_cast(_round_mode) & ((1U << 1)-1)), + reserved2(0), + scale(_scale & ((1U << 31)-1)), + reserved3(0) + {} + CONSTEXPR npu_set_ifm2_scale_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_SCALE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + shift(0), + dbl_rnd(0), + reserved1(0), + round_mode(0), + reserved2(0), + scale(0), + reserved3(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_SCALE) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = 
static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_SCALE); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(shift) << 16; + word |= uint64_t(dbl_rnd) << 22; + word |= uint64_t(round_mode) << 29; + word |= uint64_t(scale) << 32; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd1_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm2_scale_t& set_opcode(NPU_NAMESPACE::cmd1_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm2_scale_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_shift() const + { + return static_cast(shift); + } + CONSTEXPR npu_set_ifm2_scale_t& set_shift(uint32_t value) + { + assert((value >> 6) == 0); + shift = static_cast(value & ((1U << 6)-1)); + return *this; + } + CONSTEXPR uint32_t get_dbl_rnd() const + { + return static_cast(dbl_rnd); + } + CONSTEXPR npu_set_ifm2_scale_t& set_dbl_rnd(uint32_t value) + { + assert((value >> 5) == 0); + dbl_rnd = static_cast(value & ((1U << 5)-1)); + return *this; + } + CONSTEXPR NPU_NAMESPACE::round_mode_ifm get_round_mode() const + { + return static_cast(round_mode); + } + CONSTEXPR npu_set_ifm2_scale_t& set_round_mode(NPU_NAMESPACE::round_mode_ifm value) + { + round_mode = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR uint32_t get_scale() const + { + return static_cast(scale); + } + CONSTEXPR npu_set_ifm2_scale_t& set_scale(uint32_t value) + { + assert((value >> 31) == 0); + scale = value & ((1U << 31)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("shift", 
std::to_string(shift))); + fields.push_back(std::make_pair("dbl_rnd", std::to_string(dbl_rnd))); + fields.push_back(std::make_pair("round_mode", (round_mode < (sizeof(round_mode_ifm_str)/sizeof(round_mode_ifm_str[0])) ? round_mode_ifm_str[round_mode] : "****"))); + fields.push_back(std::make_pair("scale", std::to_string(scale))); + } +#endif +#endif +}; + +struct npu_set_op_scalar_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t reserved1:16; + uint32_t scalar:32; +#ifdef __cplusplus +public: + npu_set_op_scalar_t(uint32_t _scalar) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OP_SCALAR)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + reserved1(0), + scalar(_scalar) + {} + CONSTEXPR npu_set_op_scalar_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OP_SCALAR)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + reserved1(0), + scalar(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OP_SCALAR) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OP_SCALAR); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(scalar) << 32; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd1_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_op_scalar_t& set_opcode(NPU_NAMESPACE::cmd1_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_op_scalar_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR 
uint32_t get_scalar() const + { + return static_cast(scalar); + } + CONSTEXPR npu_set_op_scalar_t& set_scalar(uint32_t value) + { + scalar = value; + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("scalar", std::to_string(((scalar <= std::numeric_limits::max() ? static_cast(scalar) : scalar - std::numeric_limits::min() + std::numeric_limits::max()) << 0) >> 0))); + } +#endif +#endif +}; + +struct npu_set_dma0_src_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_dma0_src_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SRC)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_dma0_src_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SRC)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SRC) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SRC); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_dma0_src_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & 
static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_dma0_dst_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_dma0_dst_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_DST)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_dma0_dst_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_DST)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_DST) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_DST); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_dma0_dst_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & 
static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_dma0_len_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_dma0_len_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_LEN)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_dma0_len_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_LEN)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_LEN) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_LEN); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_dma0_len_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void 
disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_dma0_src_stride0_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_dma0_src_stride0_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SRC_STRIDE0)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_dma0_src_stride0_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SRC_STRIDE0)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SRC_STRIDE0) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SRC_STRIDE0); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_dma0_src_stride0_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { 
+ std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_dma0_src_stride1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_dma0_src_stride1_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SRC_STRIDE1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_dma0_src_stride1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SRC_STRIDE1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SRC_STRIDE1) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SRC_STRIDE1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_dma0_src_stride1_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << 
std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_dma0_dst_stride0_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_dma0_dst_stride0_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_DST_STRIDE0)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_dma0_dst_stride0_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_DST_STRIDE0)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_DST_STRIDE0) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_DST_STRIDE0); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_dma0_dst_stride0_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + 
fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_dma0_dst_stride1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_dma0_dst_stride1_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_DST_STRIDE1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_dma0_dst_stride1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_DST_STRIDE1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_DST_STRIDE1) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_DST_STRIDE1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_dma0_dst_stride1_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", 
saddr.str())); + } +#endif +#endif +}; + +struct npu_set_dma0_idx_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_dma0_idx_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_IDX)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_dma0_idx_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_IDX)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_IDX) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_IDX); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_dma0_idx_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_dma0_idx_max_t +{ +#ifdef __cplusplus +private: 
+#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t reserved1:16; + uint32_t idx_max:31; + uint32_t reserved2:1; +#ifdef __cplusplus +public: + npu_set_dma0_idx_max_t(uint32_t _idx_max) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_IDX_MAX)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + reserved1(0), + idx_max(_idx_max & ((1U << 31)-1)), + reserved2(0) + {} + CONSTEXPR npu_set_dma0_idx_max_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_IDX_MAX)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + reserved1(0), + idx_max(0), + reserved2(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_IDX_MAX) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_IDX_MAX); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(idx_max) << 32; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd1_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_dma0_idx_max_t& set_opcode(NPU_NAMESPACE::cmd1_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_dma0_idx_max_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_idx_max() const + { + return static_cast(idx_max); + } + CONSTEXPR npu_set_dma0_idx_max_t& set_idx_max(uint32_t value) + { + assert((value >> 31) == 0); + idx_max = value & ((1U << 31)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + 
fields.push_back(std::make_pair("idx_max", std::to_string(idx_max))); + } +#endif +#endif +}; + +struct npu_set_dma0_idx_skip1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_dma0_idx_skip1_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_IDX_SKIP1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_dma0_idx_skip1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_IDX_SKIP1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_IDX_SKIP1) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_IDX_SKIP1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_dma0_idx_skip1_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", 
saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ifm2_base0_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ifm2_base0_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE0)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm2_base0_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE0)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE0) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE0); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ifm2_base0_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ifm2_base1_t +{ +#ifdef 
__cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ifm2_base1_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm2_base1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE1) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ifm2_base1_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ifm2_base2_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t 
control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ifm2_base2_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE2)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm2_base2_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE2)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE2) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE2); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ifm2_base2_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ifm2_base3_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef 
__cplusplus +public: + npu_set_ifm2_base3_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE3)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm2_base3_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE3)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE3) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE3); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ifm2_base3_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ifm2_stride_x_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ifm2_stride_x_t(uint64_t _addr) : + 
opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_X)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm2_stride_x_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_X)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_X) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_X); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ifm2_stride_x_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ifm2_stride_y_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ifm2_stride_y_t(uint64_t _addr) : + 
opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_Y)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm2_stride_y_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_Y)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_Y) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_Y); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ifm2_stride_y_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ifm2_stride_c_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ifm2_stride_c_t(uint64_t _addr) : + 
opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_C)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm2_stride_c_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_C)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_C) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_C); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ifm2_stride_c_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_weight1_base_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_weight1_base_t(uint64_t _addr) : + 
opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT1_BASE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_weight1_base_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT1_BASE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT1_BASE) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT1_BASE); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_weight1_base_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_weight1_length_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t reserved1:16; + uint32_t length:32; +#ifdef __cplusplus +public: + npu_set_weight1_length_t(uint32_t _length) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT1_LENGTH)), + reserved0(0), + 
control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + reserved1(0), + length(_length) + {} + CONSTEXPR npu_set_weight1_length_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT1_LENGTH)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + reserved1(0), + length(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT1_LENGTH) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT1_LENGTH); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(length) << 32; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd1_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_weight1_length_t& set_opcode(NPU_NAMESPACE::cmd1_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_weight1_length_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_length() const + { + return static_cast(length); + } + CONSTEXPR npu_set_weight1_length_t& set_length(uint32_t value) + { + length = value; + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("length", std::to_string(length))); + } +#endif +#endif +}; + +struct npu_set_weight2_base_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_weight2_base_t(uint64_t _addr) : + 
opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT2_BASE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_weight2_base_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT2_BASE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT2_BASE) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT2_BASE); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_weight2_base_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_weight2_length_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t reserved1:16; + uint32_t length:32; +#ifdef __cplusplus +public: + npu_set_weight2_length_t(uint32_t _length) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT2_LENGTH)), + reserved0(0), + 
control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + reserved1(0), + length(_length) + {} + CONSTEXPR npu_set_weight2_length_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT2_LENGTH)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + reserved1(0), + length(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT2_LENGTH) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT2_LENGTH); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(length) << 32; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd1_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_weight2_length_t& set_opcode(NPU_NAMESPACE::cmd1_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_weight2_length_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_length() const + { + return static_cast(length); + } + CONSTEXPR npu_set_weight2_length_t& set_length(uint32_t value) + { + length = value; + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("length", std::to_string(length))); + } +#endif +#endif +}; + +struct npu_set_weight3_base_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_weight3_base_t(uint64_t _addr) : + 
opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT3_BASE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_weight3_base_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT3_BASE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT3_BASE) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT3_BASE); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_weight3_base_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_weight3_length_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t reserved1:16; + uint32_t length:32; +#ifdef __cplusplus +public: + npu_set_weight3_length_t(uint32_t _length) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT3_LENGTH)), + reserved0(0), + 
control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + reserved1(0), + length(_length) + {} + CONSTEXPR npu_set_weight3_length_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT3_LENGTH)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + reserved1(0), + length(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT3_LENGTH) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT3_LENGTH); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(length) << 32; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd1_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_weight3_length_t& set_opcode(NPU_NAMESPACE::cmd1_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_weight3_length_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_length() const + { + return static_cast(length); + } + CONSTEXPR npu_set_weight3_length_t& set_length(uint32_t value) + { + length = value; + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("length", std::to_string(length))); + } +#endif +#endif +}; + +struct npu_set_resize_x_step_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t one_step_int:4; + uint32_t blk_step_int:11; + uint32_t reserved1:1; + uint32_t one_step_mod:11; + uint32_t reserved2:5; + uint32_t blk_step_mod:11; + uint32_t reserved3:5; +#ifdef 
__cplusplus +public: + npu_set_resize_x_step_t(uint32_t _one_step_int, uint32_t _blk_step_int, uint32_t _one_step_mod, uint32_t _blk_step_mod) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_RESIZE_X)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + one_step_int(_one_step_int & ((1U << 4)-1)), + blk_step_int(_blk_step_int & ((1U << 11)-1)), + reserved1(0), + one_step_mod(_one_step_mod & ((1U << 11)-1)), + reserved2(0), + blk_step_mod(_blk_step_mod & ((1U << 11)-1)), + reserved3(0) + {} + CONSTEXPR npu_set_resize_x_step_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_RESIZE_X)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + one_step_int(0), + blk_step_int(0), + reserved1(0), + one_step_mod(0), + reserved2(0), + blk_step_mod(0), + reserved3(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_RESIZE_X) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_RESIZE_X); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(one_step_int) << 16; + word |= uint64_t(blk_step_int) << 20; + word |= uint64_t(one_step_mod) << 32; + word |= uint64_t(blk_step_mod) << 48; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd1_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_resize_x_step_t& set_opcode(NPU_NAMESPACE::cmd1_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_resize_x_step_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_one_step_int() const + { + 
return static_cast(one_step_int); + } + CONSTEXPR npu_set_resize_x_step_t& set_one_step_int(uint32_t value) + { + assert((value >> 4) == 0); + one_step_int = static_cast(value & ((1U << 4)-1)); + return *this; + } + CONSTEXPR uint32_t get_blk_step_int() const + { + return static_cast(blk_step_int); + } + CONSTEXPR npu_set_resize_x_step_t& set_blk_step_int(uint32_t value) + { + assert((value >> 11) == 0); + blk_step_int = static_cast(value & ((1U << 11)-1)); + return *this; + } + CONSTEXPR uint32_t get_one_step_mod() const + { + return static_cast(one_step_mod); + } + CONSTEXPR npu_set_resize_x_step_t& set_one_step_mod(uint32_t value) + { + assert((value >> 11) == 0); + one_step_mod = static_cast(value & ((1U << 11)-1)); + return *this; + } + CONSTEXPR uint32_t get_blk_step_mod() const + { + return static_cast(blk_step_mod); + } + CONSTEXPR npu_set_resize_x_step_t& set_blk_step_mod(uint32_t value) + { + assert((value >> 11) == 0); + blk_step_mod = static_cast(value & ((1U << 11)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("one_step_int", std::to_string(one_step_int))); + fields.push_back(std::make_pair("blk_step_int", std::to_string(blk_step_int))); + fields.push_back(std::make_pair("one_step_mod", std::to_string(one_step_mod))); + fields.push_back(std::make_pair("blk_step_mod", std::to_string(blk_step_mod))); + } +#endif +#endif +}; + +struct npu_set_resize_y_step_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t one_step_int:4; + uint32_t blk_step_int:11; + uint32_t reserved1:1; + uint32_t one_step_mod:11; + uint32_t reserved2:5; + uint32_t blk_step_mod:11; + uint32_t reserved3:5; +#ifdef __cplusplus +public: + npu_set_resize_y_step_t(uint32_t _one_step_int, uint32_t _blk_step_int, uint32_t _one_step_mod, uint32_t _blk_step_mod) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_RESIZE_Y)), + 
reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + one_step_int(_one_step_int & ((1U << 4)-1)), + blk_step_int(_blk_step_int & ((1U << 11)-1)), + reserved1(0), + one_step_mod(_one_step_mod & ((1U << 11)-1)), + reserved2(0), + blk_step_mod(_blk_step_mod & ((1U << 11)-1)), + reserved3(0) + {} + CONSTEXPR npu_set_resize_y_step_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_RESIZE_Y)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + one_step_int(0), + blk_step_int(0), + reserved1(0), + one_step_mod(0), + reserved2(0), + blk_step_mod(0), + reserved3(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_RESIZE_Y) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_RESIZE_Y); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(one_step_int) << 16; + word |= uint64_t(blk_step_int) << 20; + word |= uint64_t(one_step_mod) << 32; + word |= uint64_t(blk_step_mod) << 48; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd1_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_resize_y_step_t& set_opcode(NPU_NAMESPACE::cmd1_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_resize_y_step_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_one_step_int() const + { + return static_cast(one_step_int); + } + CONSTEXPR npu_set_resize_y_step_t& set_one_step_int(uint32_t value) + { + assert((value >> 4) == 0); + one_step_int = static_cast(value & ((1U << 4)-1)); + return *this; + } + 
CONSTEXPR uint32_t get_blk_step_int() const + { + return static_cast(blk_step_int); + } + CONSTEXPR npu_set_resize_y_step_t& set_blk_step_int(uint32_t value) + { + assert((value >> 11) == 0); + blk_step_int = static_cast(value & ((1U << 11)-1)); + return *this; + } + CONSTEXPR uint32_t get_one_step_mod() const + { + return static_cast(one_step_mod); + } + CONSTEXPR npu_set_resize_y_step_t& set_one_step_mod(uint32_t value) + { + assert((value >> 11) == 0); + one_step_mod = static_cast(value & ((1U << 11)-1)); + return *this; + } + CONSTEXPR uint32_t get_blk_step_mod() const + { + return static_cast(blk_step_mod); + } + CONSTEXPR npu_set_resize_y_step_t& set_blk_step_mod(uint32_t value) + { + assert((value >> 11) == 0); + blk_step_mod = static_cast(value & ((1U << 11)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("one_step_int", std::to_string(one_step_int))); + fields.push_back(std::make_pair("blk_step_int", std::to_string(blk_step_int))); + fields.push_back(std::make_pair("one_step_mod", std::to_string(one_step_mod))); + fields.push_back(std::make_pair("blk_step_mod", std::to_string(blk_step_mod))); + } +#endif +#endif +}; + +struct npu_op_branch_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t branch_cond:1; + uint32_t reserved1:15; + uint32_t branch_target:32; +#ifdef __cplusplus +public: + npu_op_branch_t(NPU_NAMESPACE::branch_cond _branch_cond, uint32_t _branch_target) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_OP_BRANCH)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + branch_cond(static_cast(_branch_cond) & ((1U << 1)-1)), + reserved1(0), + branch_target(_branch_target) + {} + CONSTEXPR npu_op_branch_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_OP_BRANCH)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + 
branch_cond(0), + reserved1(0), + branch_target(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_OP_BRANCH) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_OP_BRANCH); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(branch_cond) << 16; + word |= uint64_t(branch_target) << 32; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd1_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_op_branch_t& set_opcode(NPU_NAMESPACE::cmd1_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_op_branch_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::branch_cond get_branch_cond() const + { + return static_cast(branch_cond); + } + CONSTEXPR npu_op_branch_t& set_branch_cond(NPU_NAMESPACE::branch_cond value) + { + branch_cond = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR uint32_t get_branch_target() const + { + return static_cast(branch_target); + } + CONSTEXPR npu_op_branch_t& set_branch_target(uint32_t value) + { + branch_target = value; + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("branch_cond", (branch_cond < (sizeof(branch_cond_str)/sizeof(branch_cond_str[0])) ? 
branch_cond_str[branch_cond] : "****"))); + fields.push_back(std::make_pair("branch_target", std::to_string(branch_target))); + } +#endif +#endif +}; +#ifdef __cplusplus +}; +#endif +#define NPU_OP_STRUCTS \ + NPU_OP_(stop) \ + NPU_OP_(irq) \ + NPU_OP_(conv) \ + NPU_OP_(depthwise) \ + NPU_OP_(pool) \ + NPU_OP_(elementwise) \ + NPU_OP_(resize) \ + NPU_OP_(dma_start) \ + NPU_OP_(dma_wait) \ + NPU_OP_(kernel_wait) \ + NPU_OP_(pmu_mask) \ + NPU_OP_(branch) + +#define NPU_SET_STRUCTS \ + NPU_SET_(ifm_pad_top) \ + NPU_SET_(ifm_pad_left) \ + NPU_SET_(ifm_pad_right) \ + NPU_SET_(ifm_pad_bottom) \ + NPU_SET_(ifm_depth_m1) \ + NPU_SET_(ifm_precision) \ + NPU_SET_(ifm_upscale) \ + NPU_SET_(ifm_zero_point) \ + NPU_SET_(ifm_width0_m1) \ + NPU_SET_(ifm_height0_m1) \ + NPU_SET_(ifm_height1_m1) \ + NPU_SET_(ifm_region) \ + NPU_SET_(ifm_broadcast) \ + NPU_SET_(ofm_width_m1) \ + NPU_SET_(ofm_height_m1) \ + NPU_SET_(ofm_depth_m1) \ + NPU_SET_(ofm_precision) \ + NPU_SET_(ofm_blk_width_m1) \ + NPU_SET_(ofm_blk_height_m1) \ + NPU_SET_(ofm_blk_depth_m1) \ + NPU_SET_(ofm_zero_point) \ + NPU_SET_(ofm_width0_m1) \ + NPU_SET_(ofm_height0_m1) \ + NPU_SET_(ofm_height1_m1) \ + NPU_SET_(ofm_region) \ + NPU_SET_(kernel_width_m1) \ + NPU_SET_(kernel_height_m1) \ + NPU_SET_(kernel_stride) \ + NPU_SET_(acc_format) \ + NPU_SET_(activation) \ + NPU_SET_(activation_min) \ + NPU_SET_(activation_max) \ + NPU_SET_(weight_region) \ + NPU_SET_(scale_region) \ + NPU_SET_(weight_format) \ + NPU_SET_(blockdep) \ + NPU_SET_(resize_x_scale_n_m1) \ + NPU_SET_(resize_y_scale_n_m1) \ + NPU_SET_(resize_x_offset) \ + NPU_SET_(resize_y_offset) \ + NPU_SET_(dma0_src_region) \ + NPU_SET_(dma0_dst_region) \ + NPU_SET_(dma0_size0) \ + NPU_SET_(dma0_size1) \ + NPU_SET_(dma0_idx_region) \ + NPU_SET_(ifm2_broadcast) \ + NPU_SET_(ifm2_precision) \ + NPU_SET_(ifm2_zero_point) \ + NPU_SET_(ifm2_width0_m1) \ + NPU_SET_(ifm2_height0_m1) \ + NPU_SET_(ifm2_height1_m1) \ + NPU_SET_(ifm2_region) \ + NPU_SET_(ifm_base0) \ + 
NPU_SET_(ifm_base1) \ + NPU_SET_(ifm_base2) \ + NPU_SET_(ifm_base3) \ + NPU_SET_(ifm_stride_x) \ + NPU_SET_(ifm_stride_y) \ + NPU_SET_(ifm_stride_c) \ + NPU_SET_(ofm_base0) \ + NPU_SET_(ofm_base1) \ + NPU_SET_(ofm_base2) \ + NPU_SET_(ofm_base3) \ + NPU_SET_(ofm_stride_x) \ + NPU_SET_(ofm_stride_y) \ + NPU_SET_(ofm_stride_c) \ + NPU_SET_(weight_base) \ + NPU_SET_(weight_length) \ + NPU_SET_(scale_base) \ + NPU_SET_(scale_length) \ + NPU_SET_(ofm_scale) \ + NPU_SET_(ifm_scale) \ + NPU_SET_(ifm2_scale) \ + NPU_SET_(op_scalar) \ + NPU_SET_(dma0_src) \ + NPU_SET_(dma0_dst) \ + NPU_SET_(dma0_len) \ + NPU_SET_(dma0_src_stride0) \ + NPU_SET_(dma0_src_stride1) \ + NPU_SET_(dma0_dst_stride0) \ + NPU_SET_(dma0_dst_stride1) \ + NPU_SET_(dma0_idx) \ + NPU_SET_(dma0_idx_max) \ + NPU_SET_(dma0_idx_skip1) \ + NPU_SET_(ifm2_base0) \ + NPU_SET_(ifm2_base1) \ + NPU_SET_(ifm2_base2) \ + NPU_SET_(ifm2_base3) \ + NPU_SET_(ifm2_stride_x) \ + NPU_SET_(ifm2_stride_y) \ + NPU_SET_(ifm2_stride_c) \ + NPU_SET_(weight1_base) \ + NPU_SET_(weight1_length) \ + NPU_SET_(weight2_base) \ + NPU_SET_(weight2_length) \ + NPU_SET_(weight3_base) \ + NPU_SET_(weight3_length) \ + NPU_SET_(resize_x_step) \ + NPU_SET_(resize_y_step) + +#define EXPAND_ACC_FORMAT(FUNC, SEP) \ + FUNC(acc_format, I32) SEP \ + FUNC(acc_format, I48) + +#define EXPAND_ACC_INPUT(FUNC, SEP) \ + FUNC(acc_input, RESET) SEP \ + FUNC(acc_input, KEEP) SEP \ + FUNC(acc_input, IFM2) + +#define EXPAND_ACC_OUTPUT(FUNC, SEP) \ + FUNC(acc_output, ENABLE) SEP \ + FUNC(acc_output, DISABLE) + +#define EXPAND_ACTIVATION_CLIP_RANGE(FUNC, SEP) \ + FUNC(activation_clip_range, B16) SEP \ + FUNC(activation_clip_range, NONE) + +#define EXPAND_ACTIVATION_FORMAT(FUNC, SEP) \ + FUNC(activation_format, NHWC) SEP \ + FUNC(activation_format, NHCWB16) + +#define EXPAND_ACTIVATION_FUNCTION(FUNC, SEP) \ + FUNC(activation_function, LUT_NONE) SEP \ + FUNC(activation_function, LUT_U8_U8) SEP \ + FUNC(activation_function, LUT_S8_S8) SEP \ + FUNC(activation_function, 
LUT_S8_S16) SEP \ + FUNC(activation_function, LUT_S8_S32) SEP \ + FUNC(activation_function, LUT_S16_S16) SEP \ + FUNC(activation_function, LUT_S16_S32) SEP \ + FUNC(activation_function, LUT_TANH) SEP \ + FUNC(activation_function, LUT_SIGMOID) + +#define EXPAND_ACTIVATION_PRECISION(FUNC, SEP) \ + FUNC(activation_precision, B8) SEP \ + FUNC(activation_precision, B16) SEP \ + FUNC(activation_precision, B32) SEP \ + FUNC(activation_precision, B64) + +#define EXPAND_ACTIVATION_REVERSE(FUNC, SEP) \ + FUNC(activation_reverse, NONE) SEP \ + FUNC(activation_reverse, H) SEP \ + FUNC(activation_reverse, W) SEP \ + FUNC(activation_reverse, C) + +#define EXPAND_ACTIVATION_STORAGE(FUNC, SEP) \ + FUNC(activation_storage, TILE2X2) SEP \ + FUNC(activation_storage, TILE3X1) SEP \ + FUNC(activation_storage, CHAINED) SEP \ + FUNC(activation_storage, NONE) + +#define EXPAND_ACTIVATION_TRANSPOSE(FUNC, SEP) \ + FUNC(activation_transpose, HWC) SEP \ + FUNC(activation_transpose, WHC) SEP \ + FUNC(activation_transpose, HCW) SEP \ + FUNC(activation_transpose, WCH) SEP \ + FUNC(activation_transpose, CHW) SEP \ + FUNC(activation_transpose, CWH) + +#define EXPAND_ACTIVATION_TYPE(FUNC, SEP) \ + FUNC(activation_type, UNSIGNED) SEP \ + FUNC(activation_type, SIGNED) + +#define EXPAND_AXI_MEM_DOMAIN(FUNC, SEP) \ + FUNC(axi_mem_domain, NON_SHARABLE) SEP \ + FUNC(axi_mem_domain, INNER_SHARABLE) SEP \ + FUNC(axi_mem_domain, OUTER_SHARABLE) SEP \ + FUNC(axi_mem_domain, SYSTEM) + +#define EXPAND_AXI_MEM_ENCODING(FUNC, SEP) \ + FUNC(axi_mem_encoding, DEVICE_NON_BUFFERABLE) SEP \ + FUNC(axi_mem_encoding, DEVICE_BUFFERABLE) SEP \ + FUNC(axi_mem_encoding, NORMAL_NON_CACHEABLE_NON_BUFFERABLE) SEP \ + FUNC(axi_mem_encoding, NORMAL_NON_CACHEABLE_BUFFERABLE) SEP \ + FUNC(axi_mem_encoding, WRITE_THROUGH_NO_ALLOCATE) SEP \ + FUNC(axi_mem_encoding, WRITE_THROUGH_READ_ALLOCATE) SEP \ + FUNC(axi_mem_encoding, WRITE_THROUGH_WRITE_ALLOCATE) SEP \ + FUNC(axi_mem_encoding, WRITE_THROUGH_READ_AND_WRITE_ALLOCATE) SEP \ + 
FUNC(axi_mem_encoding, WRITE_BACK_NO_ALLOCATE) SEP \ + FUNC(axi_mem_encoding, WRITE_BACK_READ_ALLOCATE) SEP \ + FUNC(axi_mem_encoding, WRITE_BACK_WRITE_ALLOCATE) SEP \ + FUNC(axi_mem_encoding, WRITE_BACK_READ_AND_WRITE_ALLOCATE) + +#define EXPAND_AXI_PORT(FUNC, SEP) \ + FUNC(axi_port, SRAM) SEP \ + FUNC(axi_port, EXT) + +#define EXPAND_BRANCH_COND(FUNC, SEP) \ + FUNC(branch_cond, ALWAYS) SEP \ + FUNC(branch_cond, RF_TRUE) + +#define EXPAND_BROADCAST_MODE(FUNC, SEP) \ + FUNC(broadcast_mode, NONE) SEP \ + FUNC(broadcast_mode, H) SEP \ + FUNC(broadcast_mode, W) SEP \ + FUNC(broadcast_mode, HW) SEP \ + FUNC(broadcast_mode, C) SEP \ + FUNC(broadcast_mode, CH) SEP \ + FUNC(broadcast_mode, CW) SEP \ + FUNC(broadcast_mode, CWH) SEP \ + FUNC(broadcast_mode, SCALAR) + +#define EXPAND_CMD0_OPCODE(FUNC, SEP) \ + FUNC(cmd0_opcode, NPU_OP_STOP) SEP \ + FUNC(cmd0_opcode, NPU_OP_IRQ) SEP \ + FUNC(cmd0_opcode, NPU_OP_CONV) SEP \ + FUNC(cmd0_opcode, NPU_OP_DEPTHWISE) SEP \ + FUNC(cmd0_opcode, NPU_OP_POOL) SEP \ + FUNC(cmd0_opcode, NPU_OP_ELEMENTWISE) SEP \ + FUNC(cmd0_opcode, NPU_OP_RESIZE) SEP \ + FUNC(cmd0_opcode, NPU_OP_DMA_START) SEP \ + FUNC(cmd0_opcode, NPU_OP_DMA_WAIT) SEP \ + FUNC(cmd0_opcode, NPU_OP_KERNEL_WAIT) SEP \ + FUNC(cmd0_opcode, NPU_OP_PMU_MASK) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM_PAD_TOP) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM_PAD_LEFT) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM_PAD_RIGHT) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM_PAD_BOTTOM) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM_DEPTH_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM_PRECISION) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM_UPSCALE) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM_BROADCAST) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM_ZERO_POINT) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM_WIDTH0_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM_HEIGHT0_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM_HEIGHT1_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM_REGION) SEP \ + FUNC(cmd0_opcode, NPU_SET_OFM_WIDTH_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_OFM_HEIGHT_M1) SEP 
\ + FUNC(cmd0_opcode, NPU_SET_OFM_DEPTH_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_OFM_PRECISION) SEP \ + FUNC(cmd0_opcode, NPU_SET_OFM_BLK_WIDTH_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_OFM_BLK_HEIGHT_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_OFM_BLK_DEPTH_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_OFM_ZERO_POINT) SEP \ + FUNC(cmd0_opcode, NPU_SET_OFM_WIDTH0_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_OFM_HEIGHT0_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_OFM_HEIGHT1_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_OFM_REGION) SEP \ + FUNC(cmd0_opcode, NPU_SET_KERNEL_WIDTH_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_KERNEL_HEIGHT_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_KERNEL_STRIDE) SEP \ + FUNC(cmd0_opcode, NPU_SET_ACC_FORMAT) SEP \ + FUNC(cmd0_opcode, NPU_SET_ACTIVATION) SEP \ + FUNC(cmd0_opcode, NPU_SET_ACTIVATION_MIN) SEP \ + FUNC(cmd0_opcode, NPU_SET_ACTIVATION_MAX) SEP \ + FUNC(cmd0_opcode, NPU_SET_WEIGHT_REGION) SEP \ + FUNC(cmd0_opcode, NPU_SET_SCALE_REGION) SEP \ + FUNC(cmd0_opcode, NPU_SET_RESIZE_X_SCALE_N_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_RESIZE_Y_SCALE_N_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_RESIZE_X_OFFSET) SEP \ + FUNC(cmd0_opcode, NPU_SET_RESIZE_Y_OFFSET) SEP \ + FUNC(cmd0_opcode, NPU_SET_WEIGHT_FORMAT) SEP \ + FUNC(cmd0_opcode, NPU_SET_BLOCKDEP) SEP \ + FUNC(cmd0_opcode, NPU_SET_DMA0_SRC_REGION) SEP \ + FUNC(cmd0_opcode, NPU_SET_DMA0_DST_REGION) SEP \ + FUNC(cmd0_opcode, NPU_SET_DMA0_SIZE0) SEP \ + FUNC(cmd0_opcode, NPU_SET_DMA0_SIZE1) SEP \ + FUNC(cmd0_opcode, NPU_SET_DMA0_IDX_REGION) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM2_BROADCAST) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM2_PRECISION) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM2_ZERO_POINT) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM2_WIDTH0_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM2_HEIGHT0_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM2_HEIGHT1_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM2_REGION) + +#define EXPAND_CMD1_OPCODE(FUNC, SEP) \ + FUNC(cmd1_opcode, NPU_SET_IFM_BASE0) SEP \ + FUNC(cmd1_opcode, NPU_SET_IFM_BASE1) SEP \ + FUNC(cmd1_opcode, 
NPU_SET_IFM_BASE2) SEP \ + FUNC(cmd1_opcode, NPU_SET_IFM_BASE3) SEP \ + FUNC(cmd1_opcode, NPU_SET_IFM_STRIDE_X) SEP \ + FUNC(cmd1_opcode, NPU_SET_IFM_STRIDE_Y) SEP \ + FUNC(cmd1_opcode, NPU_SET_IFM_STRIDE_C) SEP \ + FUNC(cmd1_opcode, NPU_SET_OFM_BASE0) SEP \ + FUNC(cmd1_opcode, NPU_SET_OFM_BASE1) SEP \ + FUNC(cmd1_opcode, NPU_SET_OFM_BASE2) SEP \ + FUNC(cmd1_opcode, NPU_SET_OFM_BASE3) SEP \ + FUNC(cmd1_opcode, NPU_SET_OFM_STRIDE_X) SEP \ + FUNC(cmd1_opcode, NPU_SET_OFM_STRIDE_Y) SEP \ + FUNC(cmd1_opcode, NPU_SET_OFM_STRIDE_C) SEP \ + FUNC(cmd1_opcode, NPU_SET_WEIGHT_BASE) SEP \ + FUNC(cmd1_opcode, NPU_SET_WEIGHT_LENGTH) SEP \ + FUNC(cmd1_opcode, NPU_SET_SCALE_BASE) SEP \ + FUNC(cmd1_opcode, NPU_SET_SCALE_LENGTH) SEP \ + FUNC(cmd1_opcode, NPU_SET_OFM_SCALE) SEP \ + FUNC(cmd1_opcode, NPU_SET_IFM_SCALE) SEP \ + FUNC(cmd1_opcode, NPU_SET_IFM2_SCALE) SEP \ + FUNC(cmd1_opcode, NPU_SET_OP_SCALAR) SEP \ + FUNC(cmd1_opcode, NPU_SET_DMA0_SRC) SEP \ + FUNC(cmd1_opcode, NPU_SET_DMA0_DST) SEP \ + FUNC(cmd1_opcode, NPU_SET_DMA0_LEN) SEP \ + FUNC(cmd1_opcode, NPU_SET_DMA0_SRC_STRIDE0) SEP \ + FUNC(cmd1_opcode, NPU_SET_DMA0_SRC_STRIDE1) SEP \ + FUNC(cmd1_opcode, NPU_SET_DMA0_DST_STRIDE0) SEP \ + FUNC(cmd1_opcode, NPU_SET_DMA0_DST_STRIDE1) SEP \ + FUNC(cmd1_opcode, NPU_SET_DMA0_IDX) SEP \ + FUNC(cmd1_opcode, NPU_SET_DMA0_IDX_MAX) SEP \ + FUNC(cmd1_opcode, NPU_SET_DMA0_IDX_SKIP1) SEP \ + FUNC(cmd1_opcode, NPU_SET_IFM2_BASE0) SEP \ + FUNC(cmd1_opcode, NPU_SET_IFM2_BASE1) SEP \ + FUNC(cmd1_opcode, NPU_SET_IFM2_BASE2) SEP \ + FUNC(cmd1_opcode, NPU_SET_IFM2_BASE3) SEP \ + FUNC(cmd1_opcode, NPU_SET_IFM2_STRIDE_X) SEP \ + FUNC(cmd1_opcode, NPU_SET_IFM2_STRIDE_Y) SEP \ + FUNC(cmd1_opcode, NPU_SET_IFM2_STRIDE_C) SEP \ + FUNC(cmd1_opcode, NPU_SET_WEIGHT1_BASE) SEP \ + FUNC(cmd1_opcode, NPU_SET_WEIGHT1_LENGTH) SEP \ + FUNC(cmd1_opcode, NPU_SET_WEIGHT2_BASE) SEP \ + FUNC(cmd1_opcode, NPU_SET_WEIGHT2_LENGTH) SEP \ + FUNC(cmd1_opcode, NPU_SET_WEIGHT3_BASE) SEP \ + FUNC(cmd1_opcode, 
NPU_SET_WEIGHT3_LENGTH) SEP \ + FUNC(cmd1_opcode, NPU_SET_RESIZE_X) SEP \ + FUNC(cmd1_opcode, NPU_SET_RESIZE_Y) SEP \ + FUNC(cmd1_opcode, NPU_OP_BRANCH) + +#define EXPAND_CMD_CTRL(FUNC, SEP) \ + FUNC(cmd_ctrl, CMD0_CTRL) SEP \ + FUNC(cmd_ctrl, CMD1_CTRL) + +#define EXPAND_CUSTOM_DMA(FUNC, SEP) \ + FUNC(custom_dma, NOT_IMPLEMENTED) SEP \ + FUNC(custom_dma, IMPLEMENTED) + +#define EXPAND_DMA_FAULT_CHANNEL(FUNC, SEP) \ + FUNC(dma_fault_channel, CMD_READ) SEP \ + FUNC(dma_fault_channel, IFM_READ) SEP \ + FUNC(dma_fault_channel, WEIGHT_READ) SEP \ + FUNC(dma_fault_channel, SBS_READ) SEP \ + FUNC(dma_fault_channel, MEM2MEM_READ) SEP \ + FUNC(dma_fault_channel, OFM_WRITE) SEP \ + FUNC(dma_fault_channel, MEM2MEM_WRITE) + +#define EXPAND_DMA_FAULT_SRC(FUNC, SEP) \ + FUNC(dma_fault_src, SRAM) SEP \ + FUNC(dma_fault_src, EXT) + +#define EXPAND_DMA_IDX_MODE(FUNC, SEP) \ + FUNC(dma_idx_mode, DISABLED) SEP \ + FUNC(dma_idx_mode, ENABLED) + +#define EXPAND_DMA_REGION_MODE(FUNC, SEP) \ + FUNC(dma_region_mode, EXTERNAL) SEP \ + FUNC(dma_region_mode, INTERNAL) + +#define EXPAND_DMA_STRIDE_MODE(FUNC, SEP) \ + FUNC(dma_stride_mode, D1) SEP \ + FUNC(dma_stride_mode, D2) SEP \ + FUNC(dma_stride_mode, D3) + +#define EXPAND_ELEMENTWISE_MODE(FUNC, SEP) \ + FUNC(elementwise_mode, MUL) SEP \ + FUNC(elementwise_mode, ADD) SEP \ + FUNC(elementwise_mode, SUB) SEP \ + FUNC(elementwise_mode, MIN) SEP \ + FUNC(elementwise_mode, MAX) SEP \ + FUNC(elementwise_mode, LRELU) SEP \ + FUNC(elementwise_mode, ABS) SEP \ + FUNC(elementwise_mode, CLZ) SEP \ + FUNC(elementwise_mode, SHR) SEP \ + FUNC(elementwise_mode, SHL) SEP \ + FUNC(elementwise_mode, LSR) SEP \ + FUNC(elementwise_mode, DIV) SEP \ + FUNC(elementwise_mode, CMP_EQ) SEP \ + FUNC(elementwise_mode, CMP_NE) SEP \ + FUNC(elementwise_mode, CMP_GE) SEP \ + FUNC(elementwise_mode, CMP_GT) SEP \ + FUNC(elementwise_mode, AND) SEP \ + FUNC(elementwise_mode, OR) SEP \ + FUNC(elementwise_mode, XOR) SEP \ + FUNC(elementwise_mode, NOT) SEP \ + 
FUNC(elementwise_mode, AND_NOT) + +#define EXPAND_IFM_UPSCALE_MODE(FUNC, SEP) \ + FUNC(ifm_upscale_mode, NONE) SEP \ + FUNC(ifm_upscale_mode, NEAREST) SEP \ + FUNC(ifm_upscale_mode, ZEROS) + +#define EXPAND_KERNEL_DECOMPOSITION(FUNC, SEP) \ + FUNC(kernel_decomposition, D8X8) SEP \ + FUNC(kernel_decomposition, D4X4) + +#define EXPAND_KERNEL_DILATION(FUNC, SEP) \ + FUNC(kernel_dilation, NONE) SEP \ + FUNC(kernel_dilation, X2) + +#define EXPAND_MAX_BEATS(FUNC, SEP) \ + FUNC(max_beats, B64) SEP \ + FUNC(max_beats, B128) SEP \ + FUNC(max_beats, B256) + +#define EXPAND_MICROBLOCK(FUNC, SEP) \ + FUNC(microblock, U1X1) SEP \ + FUNC(microblock, U1X2) SEP \ + FUNC(microblock, U1X4) SEP \ + FUNC(microblock, U2X2) SEP \ + FUNC(microblock, U2X4) SEP \ + FUNC(microblock, U4X4) + +#define EXPAND_OFM_SCALE_MODE(FUNC, SEP) \ + FUNC(ofm_scale_mode, PER_CHANNEL) SEP \ + FUNC(ofm_scale_mode, GLOBAL) + +#define EXPAND_PMU_AXI_CHANNEL(FUNC, SEP) \ + FUNC(pmu_axi_channel, RD_CMD) SEP \ + FUNC(pmu_axi_channel, RD_IFM) SEP \ + FUNC(pmu_axi_channel, RD_WEIGHTS) SEP \ + FUNC(pmu_axi_channel, RD_SCALE_BIAS) SEP \ + FUNC(pmu_axi_channel, RD_MEM2MEM) SEP \ + FUNC(pmu_axi_channel, RD_IFM_STREAM) SEP \ + FUNC(pmu_axi_channel, RD_MEM2MEM_IDX) SEP \ + FUNC(pmu_axi_channel, WR_OFM) SEP \ + FUNC(pmu_axi_channel, WR_MEM2MEM) + +#define EXPAND_PMU_EVENT(FUNC, SEP) \ + FUNC(pmu_event, NO_EVENT) SEP \ + FUNC(pmu_event, CYCLE) SEP \ + FUNC(pmu_event, NPU_IDLE) SEP \ + FUNC(pmu_event, CC_STALLED_ON_BLOCKDEP) SEP \ + FUNC(pmu_event, CC_STALLED_ON_SHRAM_RECONFIG) SEP \ + FUNC(pmu_event, NPU_ACTIVE) SEP \ + FUNC(pmu_event, MAC_ACTIVE) SEP \ + FUNC(pmu_event, MAC_DPU_ACTIVE) SEP \ + FUNC(pmu_event, MAC_STALLED_BY_W_OR_ACC) SEP \ + FUNC(pmu_event, MAC_STALLED_BY_W) SEP \ + FUNC(pmu_event, MAC_STALLED_BY_ACC) SEP \ + FUNC(pmu_event, MAC_STALLED_BY_IB) SEP \ + FUNC(pmu_event, MAC_STALLED_BY_INT_W) SEP \ + FUNC(pmu_event, MAC_STALLED_BY_INT_ACC) SEP \ + FUNC(pmu_event, AO_ACTIVE) SEP \ + FUNC(pmu_event, 
AO_STALLED_BY_BS_OR_OB) SEP \ + FUNC(pmu_event, AO_STALLED_BY_BS) SEP \ + FUNC(pmu_event, AO_STALLED_BY_OB) SEP \ + FUNC(pmu_event, AO_STALLED_BY_AB_OR_CB) SEP \ + FUNC(pmu_event, AO_STALLED_BY_AB) SEP \ + FUNC(pmu_event, AO_STALLED_BY_CB) SEP \ + FUNC(pmu_event, WD_ACTIVE) SEP \ + FUNC(pmu_event, WD_STALLED) SEP \ + FUNC(pmu_event, WD_STALLED_BY_WD_BUF) SEP \ + FUNC(pmu_event, WD_STALLED_BY_WS_FC) SEP \ + FUNC(pmu_event, WD_STALLED_BY_WS_TC) SEP \ + FUNC(pmu_event, WD_TRANS_WBLK) SEP \ + FUNC(pmu_event, WD_TRANS_WS_FC) SEP \ + FUNC(pmu_event, WD_TRANS_WS_TC) SEP \ + FUNC(pmu_event, WD_STALLED_BY_WS_SC0) SEP \ + FUNC(pmu_event, WD_STALLED_BY_WS_SC1) SEP \ + FUNC(pmu_event, WD_STALLED_BY_WS_SC2) SEP \ + FUNC(pmu_event, WD_STALLED_BY_WS_SC3) SEP \ + FUNC(pmu_event, WD_PARSE_ACTIVE_SC0) SEP \ + FUNC(pmu_event, WD_PARSE_ACTIVE_SC1) SEP \ + FUNC(pmu_event, WD_PARSE_ACTIVE_SC2) SEP \ + FUNC(pmu_event, WD_PARSE_ACTIVE_SC3) SEP \ + FUNC(pmu_event, WD_PARSE_STALL_SC0) SEP \ + FUNC(pmu_event, WD_PARSE_STALL_SC1) SEP \ + FUNC(pmu_event, WD_PARSE_STALL_SC2) SEP \ + FUNC(pmu_event, WD_PARSE_STALL_SC3) SEP \ + FUNC(pmu_event, WD_PARSE_STALL_IN_SC0) SEP \ + FUNC(pmu_event, WD_PARSE_STALL_IN_SC1) SEP \ + FUNC(pmu_event, WD_PARSE_STALL_IN_SC2) SEP \ + FUNC(pmu_event, WD_PARSE_STALL_IN_SC3) SEP \ + FUNC(pmu_event, WD_PARSE_STALL_OUT_SC0) SEP \ + FUNC(pmu_event, WD_PARSE_STALL_OUT_SC1) SEP \ + FUNC(pmu_event, WD_PARSE_STALL_OUT_SC2) SEP \ + FUNC(pmu_event, WD_PARSE_STALL_OUT_SC3) SEP \ + FUNC(pmu_event, WD_TRANS_WS_SC0) SEP \ + FUNC(pmu_event, WD_TRANS_WS_SC1) SEP \ + FUNC(pmu_event, WD_TRANS_WS_SC2) SEP \ + FUNC(pmu_event, WD_TRANS_WS_SC3) SEP \ + FUNC(pmu_event, WD_TRANS_WB0) SEP \ + FUNC(pmu_event, WD_TRANS_WB1) SEP \ + FUNC(pmu_event, WD_TRANS_WB2) SEP \ + FUNC(pmu_event, WD_TRANS_WB3) SEP \ + FUNC(pmu_event, SRAM_RD_TRANS_ACCEPTED) SEP \ + FUNC(pmu_event, SRAM_RD_TRANS_COMPLETED) SEP \ + FUNC(pmu_event, SRAM_RD_DATA_BEAT_RECEIVED) SEP \ + FUNC(pmu_event, 
SRAM_RD_TRAN_REQ_STALLED) SEP \ + FUNC(pmu_event, SRAM_WR_TRANS_ACCEPTED) SEP \ + FUNC(pmu_event, SRAM_WR_TRANS_COMPLETED_M) SEP \ + FUNC(pmu_event, SRAM_WR_TRANS_COMPLETED_S) SEP \ + FUNC(pmu_event, SRAM_WR_DATA_BEAT_WRITTEN) SEP \ + FUNC(pmu_event, SRAM_WR_TRAN_REQ_STALLED) SEP \ + FUNC(pmu_event, SRAM_WR_DATA_BEAT_STALLED) SEP \ + FUNC(pmu_event, SRAM_ENABLED_CYCLES) SEP \ + FUNC(pmu_event, SRAM_RD_STALL_LIMIT) SEP \ + FUNC(pmu_event, SRAM_WR_STALL_LIMIT) SEP \ + FUNC(pmu_event, AXI_LATENCY_ANY) SEP \ + FUNC(pmu_event, AXI_LATENCY_32) SEP \ + FUNC(pmu_event, AXI_LATENCY_64) SEP \ + FUNC(pmu_event, AXI_LATENCY_128) SEP \ + FUNC(pmu_event, AXI_LATENCY_256) SEP \ + FUNC(pmu_event, AXI_LATENCY_512) SEP \ + FUNC(pmu_event, AXI_LATENCY_1024) SEP \ + FUNC(pmu_event, ECC_DMA) SEP \ + FUNC(pmu_event, ECC_MAC_IB) SEP \ + FUNC(pmu_event, ECC_MAC_AB) SEP \ + FUNC(pmu_event, ECC_AO_CB) SEP \ + FUNC(pmu_event, ECC_AO_OB) SEP \ + FUNC(pmu_event, ECC_AO_LUT) SEP \ + FUNC(pmu_event, EXT_RD_TRANS_ACCEPTED) SEP \ + FUNC(pmu_event, EXT_RD_TRANS_COMPLETED) SEP \ + FUNC(pmu_event, EXT_RD_DATA_BEAT_RECEIVED) SEP \ + FUNC(pmu_event, EXT_RD_TRAN_REQ_STALLED) SEP \ + FUNC(pmu_event, EXT_WR_TRANS_ACCEPTED) SEP \ + FUNC(pmu_event, EXT_WR_TRANS_COMPLETED_M) SEP \ + FUNC(pmu_event, EXT_WR_TRANS_COMPLETED_S) SEP \ + FUNC(pmu_event, EXT_WR_DATA_BEAT_WRITTEN) SEP \ + FUNC(pmu_event, EXT_WR_TRAN_REQ_STALLED) SEP \ + FUNC(pmu_event, EXT_WR_DATA_BEAT_STALLED) SEP \ + FUNC(pmu_event, EXT_ENABLED_CYCLES) SEP \ + FUNC(pmu_event, EXT_RD_STALL_LIMIT) SEP \ + FUNC(pmu_event, EXT_WR_STALL_LIMIT) + +#define EXPAND_PMU_PORT_DISABLE(FUNC, SEP) \ + FUNC(pmu_port_disable, ENABLE) SEP \ + FUNC(pmu_port_disable, DISABLE) + +#define EXPAND_POOLING_MODE(FUNC, SEP) \ + FUNC(pooling_mode, MAX) SEP \ + FUNC(pooling_mode, AVERAGE) SEP \ + FUNC(pooling_mode, REDUCE_SUM) SEP \ + FUNC(pooling_mode, SUM) SEP \ + FUNC(pooling_mode, NONE) SEP \ + FUNC(pooling_mode, MIN) SEP \ + FUNC(pooling_mode, ARGMAX_X) SEP \ + 
FUNC(pooling_mode, ARGMAX_Y) + +#define EXPAND_PRIVILEGE_LEVEL(FUNC, SEP) \ + FUNC(privilege_level, USER) SEP \ + FUNC(privilege_level, PRIVILEGED) + +#define EXPAND_RAM_ID(FUNC, SEP) \ + FUNC(ram_id, LUT) SEP \ + FUNC(ram_id, IB) SEP \ + FUNC(ram_id, AB) SEP \ + FUNC(ram_id, CB) SEP \ + FUNC(ram_id, OB) + +#define EXPAND_RESIZE_MODE(FUNC, SEP) \ + FUNC(resize_mode, BILINEAR) SEP \ + FUNC(resize_mode, REPLICATE) SEP \ + FUNC(resize_mode, NEAREST) + +#define EXPAND_ROUND_MODE_IFM(FUNC, SEP) \ + FUNC(round_mode_ifm, DOUBLE_SYMMETRIC) SEP \ + FUNC(round_mode_ifm, NATURAL) + +#define EXPAND_ROUND_MODE_OFM(FUNC, SEP) \ + FUNC(round_mode_ofm, DOUBLE_SYMMETRIC) SEP \ + FUNC(round_mode_ofm, NATURAL) SEP \ + FUNC(round_mode_ofm, DOUBLE_ASYMMETRIC) SEP \ + FUNC(round_mode_ofm, SYMMETRIC) SEP \ + FUNC(round_mode_ofm, TRUNCATE_TO_ZERO) SEP \ + FUNC(round_mode_ofm, TRUNCATE_TO_LOWER) + +#define EXPAND_SECURITY_LEVEL(FUNC, SEP) \ + FUNC(security_level, SECURE) SEP \ + FUNC(security_level, NON_SECURE) + +#define EXPAND_STATE(FUNC, SEP) \ + FUNC(state, STOPPED) SEP \ + FUNC(state, RUNNING) + +#define EXPAND_WD_ACTIVE_CORE(FUNC, SEP) \ + FUNC(wd_active_core, NONE) SEP \ + FUNC(wd_active_core, STANDARD) SEP \ + FUNC(wd_active_core, FAST) SEP \ + FUNC(wd_active_core, TENSOR) + +#define EXPAND_WEIGHT_FORMAT(FUNC, SEP) \ + FUNC(weight_format, SWD) SEP \ + FUNC(weight_format, FWD) + +#define EXPAND_WEIGHT_ORDER(FUNC, SEP) \ + FUNC(weight_order, DEPTH_FIRST) SEP \ + FUNC(weight_order, PART_KERNEL_FIRST) + +#define EXPAND_WEIGHT_SPARSITY(FUNC, SEP) \ + FUNC(weight_sparsity, NONE) SEP \ + FUNC(weight_sparsity, SPARSE_2_4) + +#ifdef __cplusplus +} +#endif diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_performance.cpp b/ethosu/regor/architecture/ethosu85/ethos_u85_performance.cpp new file mode 100644 index 00000000..9623b239 --- /dev/null +++ b/ethosu/regor/architecture/ethosu85/ethos_u85_performance.cpp @@ -0,0 +1,545 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm 
Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "ethos_u85_performance.hpp" + +#include "common/common.hpp" +#include "common/logging.hpp" + +#include "architecture/architecture.hpp" +#include "ethos_u85.hpp" + +namespace regor +{ + +static const Point2i s_SubkernelLimits[] = { + {0, 0}, // No kernel + {8, 8}, // Convolution + {8, 8}, // Depthwise + {1, 1}, // VectorProduct + {8, 8}, // Pooling + {8, 8}, // ReduceSum + {1, 1}, // Elementwise + {1, 1}, // Resize +}; + +static constexpr bool OpUsesMacs(EthosU85NpuOp npuOp) +{ + return (npuOp != EthosU85NpuOp::Elementwise && npuOp != EthosU85NpuOp::Resize && npuOp != EthosU85NpuOp::Dma && npuOp != EthosU85NpuOp::None); +} + +EthosU85Performance::EthosU85Performance(ArchEthosU85 *arch, const EthosU85PerfInfo *perfInfo) : _arch(arch) +{ + _perfInfo = perfInfo; +} + +CycleCost EthosU85Performance::MeasureCycleCost(const PerformanceQuery &query, const std::vector &fused) +{ + CycleCost cycles; + auto npuOp = _arch->GetHWOp(query.type); + + // Convolution/Vector product cycle calculation + if ( OpUsesMacs(npuOp) ) + { + if ( (npuOp == EthosU85NpuOp::Depthwise) || (npuOp == EthosU85NpuOp::Pooling) ) + { + cycles.macs = int64_t(query.kernel->ElementsWH()) * query.ofmShape.Elements() * 1; + } + else + { + cycles.macs = int64_t(query.kernel->ElementsWH()) * query.ofmShape.Elements() * query.ifmShape[0].Depth(); + } + + cycles.opCycles = 
EstimateConvCycles(query, fused); + } + // Elementwise cycle calculation + else if ( npuOp == EthosU85NpuOp::Elementwise ) + { + auto ofmShape = + (query.ofmFormat == TensorFormat::NHCWB16) ? Shape::RoundAway(query.ofmShape, Shape(1, 1, 1, 16)) : query.ofmShape; + cycles.opCycles = int64_t(EstimateOutputCyclesPerElement(query, fused) * float(ofmShape.Elements())); + } + // Resize cycle calculation + else if ( npuOp == EthosU85NpuOp::Resize ) + { + // TODO: Implement for Resize + cycles.opCycles = 0; + } + // DMA cycle calculation + else if ( npuOp == EthosU85NpuOp::Dma ) + { + // TODO: below is incorrect (MLBEDSW-8400) + + auto ofmShape = + (query.ofmFormat == TensorFormat::NHCWB16) ? Shape::RoundAway(query.ofmShape, Shape(1, 1, 1, 16)) : query.ofmShape; + cycles.opCycles = 0; + } + else + { + assert(false && "Unknown operator cycle costing"); + } + + return cycles; +} + +int64_t EthosU85Performance::MemToMemCycles(const ArchitectureMemory *dest, const ArchitectureMemory *source, int sizeBytes) +{ + int64_t fromCycles = int64_t(float(sizeBytes) / source->Bandwidth()); + fromCycles += source->ReadLatency(); + int64_t toCycles = int64_t(float(sizeBytes) / dest->Bandwidth()); + toCycles += source->WriteLatency(); + return std::max(fromCycles, toCycles); +} + +int64_t EthosU85Performance::EstimateConvCycles(const PerformanceQuery &query, const std::vector &fused) +{ + EthosU85OpConfig *opConfig = static_cast(query.config); + auto npuOp = _arch->GetHWOp(query.type); + assert(npuOp != EthosU85NpuOp::None); + + Shape ifmBlock = Shape::Min(query.ifmShape[0], opConfig->IfmBlock()); + Shape ofmBlock = Shape::Min(query.ofmShape, opConfig->OfmBlock()); + Shape ofmUBlock = opConfig->OfmUBlock(); + + // HW Optimisation check + if ( (ofmUBlock.Height() == 2) && (npuOp == EthosU85NpuOp::Convolution || npuOp == EthosU85NpuOp::VectorProduct) && + (query.ofmShape.Height() == 1) && (query.ofmShape.Width() % 2 == 0) && // Optimisation only applies for even + // width tensors + 
(query.kernel->Size().y == 1) ) + { + ofmUBlock = Shape(1, 1, 4, ofmUBlock.Depth()); + ofmBlock = ofmBlock.WithHeight(1); + } + + int ifmBits = DataTypeSizeBits(query.ifmType[0]); + Shape numUBlocks = Shape::DivRoundUp(ofmBlock, ofmUBlock); + bool use48BitAcc = opConfig->Acc() == EthosU85Accumulator::Acc48; + + int64_t cyclesDpuBlk = 0; + int cyclesWb = 32 * ofmUBlock.Depth() / 8; + + int subKernelWidth = s_SubkernelLimits[int(npuOp)].x; + int subKernelHeight = s_SubkernelLimits[int(npuOp)].y; + const Point2i kernelSize = query.kernel->Size(); + bool isConvolutionMxN = (npuOp == EthosU85NpuOp::Convolution); + + for ( int x = 0; x < kernelSize.x; x += subKernelWidth ) + { + for ( int y = 0; y < kernelSize.y; y += subKernelHeight ) + { + int subKernelElements = std::min(kernelSize.y - y, subKernelHeight); + subKernelElements *= std::min(kernelSize.x - x, subKernelWidth); + + // Calculate processing cycles + int numKernelSteps = 0; + int cycles = 0; + if ( npuOp == EthosU85NpuOp::Pooling ) + { + numKernelSteps = 1; + cycles = std::max(4, subKernelElements) * numUBlocks.Elements() * (ifmBits / 2); + } + else if ( npuOp == EthosU85NpuOp::Depthwise ) + { + numKernelSteps = DivRoundUp(subKernelElements, 4); + cycles = 4 * numUBlocks.ElementsWH() * (ifmBits / 8); + cycles = std::max(cyclesWb, cycles) * numKernelSteps * numUBlocks.Depth(); + } + else if ( (isConvolutionMxN && opConfig->Traversal() != EthosU85Traversal::PartKernel) || + npuOp == EthosU85NpuOp::VectorProduct || npuOp == EthosU85NpuOp::ReduceSum ) + { + numKernelSteps = subKernelElements; + cycles = std::max(cyclesWb, 4 * numUBlocks.ElementsWH()) * numKernelSteps * numUBlocks.Depth(); + } + else + { + assert(opConfig->Traversal() == EthosU85Traversal::PartKernel); + int divider = (ifmBits == 16) ? 
2 : 4; + numKernelSteps = DivRoundUp(subKernelElements, divider); + cycles = std::max(cyclesWb, 4 * numUBlocks.ElementsWH()) * numKernelSteps * numUBlocks.Depth() * + DivRoundUp(ifmBlock.Depth(), 8); + } + + // Calculate delay + int delayCycles = 0; + int delay = (use48BitAcc && (_arch->_macs <= 128)) ? 3 : 2; + + if ( numUBlocks.ElementsWH() == 1 ) + { + if ( numUBlocks.Depth() == 1 ) + { + delayCycles = delay * numKernelSteps; + } + else if ( numKernelSteps > 1 ) + { + delayCycles = delay * (numKernelSteps - 1) * numUBlocks.Depth(); + } + } + + if ( isConvolutionMxN && opConfig->Traversal() == EthosU85Traversal::PartKernel ) + { + delayCycles *= DivRoundUp(ifmBlock.Depth(), 8); + } + + cyclesDpuBlk += cycles; + cyclesDpuBlk += delayCycles; + } + } + + if ( npuOp == EthosU85NpuOp::Convolution || npuOp == EthosU85NpuOp::VectorProduct || npuOp == EthosU85NpuOp::ReduceSum ) + { + cyclesDpuBlk *= DivRoundUp(query.ifmShape[0].Depth(), ifmBlock.Depth()); + } + + cyclesDpuBlk /= _arch->_cores; + + // Estimate output cycles + int numOfmBlks = Shape::DivRoundUp(query.ofmShape, ofmBlock).Elements(); + int64_t cyclesOutputBlk = int64_t(EstimateOutputCyclesPerElement(query, fused) * float(ofmBlock.Elements())); + + // Scale and bias tensor + if ( query.constShape.Size() > 0 && query.constShape.Depth() > 0 ) + { + int cyclesBiasBlk = (10 * ofmBlock.Depth() * query.constMemory->ReadLatency() / 256); + cyclesOutputBlk = std::max(cyclesOutputBlk, int64_t(cyclesBiasBlk)); + } + + int64_t cycles_cmd = EstimateMinimumMemoryCycles(query); + cycles_cmd = (cycles_cmd + cyclesOutputBlk + cyclesDpuBlk) / 4; // Per DPU + + cyclesDpuBlk = std::max(cyclesDpuBlk, cycles_cmd); + cyclesOutputBlk = std::max(cyclesOutputBlk, cycles_cmd); + + int64_t totalCycles = 0; + if ( cyclesDpuBlk > cyclesOutputBlk ) + { + totalCycles = cyclesDpuBlk * numOfmBlks + cyclesOutputBlk; + } + else + { + totalCycles = cyclesOutputBlk * numOfmBlks + cyclesDpuBlk; + } + + return totalCycles; +} + +static int 
EstimateMemoryTransfer(int cores, bool isRead, ArchitectureMemory *memory, TensorFormat format, + int elementBits, const Shape &block, const Shape &shape, int toTransfer) +{ + int burstLen = 8; + + if ( format == TensorFormat::NHCWB16 ) + { + int zStride = (shape.Width() * elementBits * 16) / 8; + if ( zStride == block.Depth() ) + { + burstLen = elementBits * block.Depth() * block.Width(); + } + else if ( isRead ) + { + burstLen = 16 * elementBits * block.Width(); + } + else + { + burstLen = 16 * elementBits * block.Width() * cores; + } + } + else if ( format == TensorFormat::NHWC ) + { + int xStride = (shape.Depth() * elementBits) / 8; + if ( isRead ) + { + if ( xStride == block.Depth() ) + { + burstLen = elementBits * block.Depth() * block.Width(); + } + else + { + burstLen = elementBits * block.Depth(); + } + } + else + { + if ( (block.Depth() <= 16) && xStride == block.Depth() ) + { + burstLen = elementBits * block.Depth() * block.Width(); + } + else + { + burstLen = std::min(std::min(64 * 8, 16 * elementBits * cores), block.Depth() * elementBits); + } + } + } + + burstLen = std::min(memory->MaxBurstLength(), burstLen / 8); + assert(burstLen > 0 && "Burst length cannot be zero"); + return (toTransfer * memory->MaxBurstLength()) / burstLen; +} + + +int64_t EthosU85Performance::EstimateMinimumMemoryCycles(const PerformanceQuery &query) +{ + EthosU85OpConfig *opConfig = static_cast(query.config); + + int ifmBits = DataTypeSizeBits(query.ifmType[0]); // All inputs expect same bit width + const int ifmCount = query.ifmShape[1].Elements() > 0 ? 
int(std::size(query.ifmShape)) : 1; + int64_t cyclesIfm = 0; + for ( int i = 0; i < ifmCount; i++ ) + { + // Input block HW transfer (only for elements present) + int ifmBytes = Shape::Min(query.ifmShape[i], opConfig->IfmBlock()).Elements() * ifmBits / 8; + int64_t cyclesIfmBlk = query.ifmMemory[i]->ReadLatency(); + int64_t tx = EstimateMemoryTransfer(_arch->_cores, true, query.ifmMemory[i], query.ifmFormat[i], ifmBits, + opConfig->IfmBlock(), query.ifmShape[i], ifmBytes); + cyclesIfmBlk += int64_t(float(tx) / query.ifmMemory[i]->Bandwidth()); + + cyclesIfm = std::max(cyclesIfm, cyclesIfmBlk); + } + + // Output block HW transfer (only for elements present) + int ofmBits = DataTypeSizeBits(query.ofmType); + int ofmBytes = Shape::Min(query.ofmShape, opConfig->OfmBlock()).Elements() * ofmBits / 8; + int64_t cyclesOfm = query.ofmMemory->WriteLatency(); + int64_t tx = EstimateMemoryTransfer(_arch->_cores, false, query.ofmMemory, query.ofmFormat, ofmBits, + opConfig->OfmBlock(), query.ofmShape, ofmBytes); + cyclesOfm += int64_t(float(tx) / query.ofmMemory->Bandwidth()); + + return cyclesIfm + cyclesOfm; +} + + +float EthosU85Performance::EstimateOutputCyclesPerElement(const PerformanceQuery &query, const std::vector &fused) +{ + EthosU85OpConfig *opConfig = static_cast(query.config); + auto npuOp = _arch->GetHWOp(query.type); + assert(npuOp != EthosU85NpuOp::None); + int ifmBits = DataTypeSizeBits(query.ifmType[0]); + int ofmBits = DataTypeSizeBits(query.ofmType); + int outputPerfIndex = 0; + + if ( (npuOp == EthosU85NpuOp::Elementwise) && (ifmBits == 32) ) + { + // Unary op else Binary op + outputPerfIndex = query.ifmShape[1].Elements() > 0 ? 
1 : 0; + } + else if ( query.type == OpType::Mul && ofmBits == 32 ) + { + outputPerfIndex = 2; + } + else if ( (query.type == OpType::Mul) || ((npuOp != EthosU85NpuOp::Elementwise) && opConfig->Acc() == EthosU85Accumulator::Acc48) ) + { + outputPerfIndex = 3; + } + else if ( query.type == OpType::Add || query.type == OpType::Sub ) + { + if ( false ) + { + // Simple Add/Sub + outputPerfIndex = 4; + } + else + { + // Advanced Add/Sub TODO: Add as perf selection as operator variant + outputPerfIndex = 5; + } + } + else if ( query.type == OpType::MaxPool ) + { + outputPerfIndex = 6; + } + else + { + outputPerfIndex = 7; + } + + int activationPerfIndex = 0; + assert(fused.size() <= 1 && "multiple op performance not available"); + for ( const FusionQuery &fusedOp : fused ) + { + if ( fusedOp.type == OpType::Sigmoid || fusedOp.type == OpType::Tanh || fusedOp.type == OpType::LookupTable ) + { + activationPerfIndex = 0; + } + else if ( fusedOp.type == OpType::Relu || fusedOp.type == OpType::Relu6 || fusedOp.type == OpType::ReluN1To1 ) + { + activationPerfIndex = 1; + } + else + { + activationPerfIndex = 2; + } + } + + float cyclesPerElement = std::max(_perfInfo->outputCycles[outputPerfIndex], _perfInfo->activationCycles[activationPerfIndex]); + + if ( npuOp == EthosU85NpuOp::Elementwise ) + { + int numElemsBlk = opConfig->OfmBlock().Elements(); + assert(numElemsBlk > 0); + float cycleCmd = (float(EstimateMinimumMemoryCycles(query)) / float(numElemsBlk) + cyclesPerElement) / 4.0f; // per DPU + cyclesPerElement = std::max(cyclesPerElement, cycleCmd); + } + + return cyclesPerElement; +} + +ElementAccess EthosU85Performance::MeasureElementAccess(const PerformanceQuery &query) +{ + ElementAccess access; + EthosU85OpConfig *opConfig = static_cast(query.config); + auto npuOp = _arch->GetHWOp(query.type); + assert(npuOp != EthosU85NpuOp::None); + + Shape ifmRounding = _arch->GetStorageRounding(query.ifmFormat[0]); + Shape ofmRounding = _arch->GetStorageRounding(query.ofmFormat); + 
+ // Convolution & pooling + if ( OpUsesMacs(npuOp) ) + { + Shape ifmBlock = Shape::Min(query.ifmShape[0], opConfig->IfmBlock()); + Shape ofmBlock = Shape::Min(query.ofmShape, opConfig->OfmBlock()); + + // Number of ofm blocks in the overall output shape + Shape ofmBlocks = Shape::DivRoundUp(query.ofmShape, ofmBlock); + + int ofmBlockDepth = ofmBlock.Depth(); + if ( npuOp == EthosU85NpuOp::Depthwise || npuOp == EthosU85NpuOp::Pooling ) + { + ofmBlocks = ofmBlocks.WithDepth(1); + ofmBlockDepth = query.ifmShape[0].Depth(); + } + + // Number of sub kernels + int subKernelWidth = s_SubkernelLimits[int(npuOp)].x; + int subKernelHeight = s_SubkernelLimits[int(npuOp)].y; + int subkernels = DivRoundUp(query.kernel->Size().x, subKernelWidth) * DivRoundUp(query.kernel->Size().y, subKernelHeight); + + int ifmFetch = + (Shape::RoundAway(ifmBlock, ifmRounding).ElementsWH() * Shape::RoundAway(query.ifmShape[0], ifmRounding).Depth()); + + int kernelRead = query.kernel->Size().AreaXY(); + if ( (npuOp != EthosU85NpuOp::Depthwise) && (npuOp != EthosU85NpuOp::Pooling) ) + { + kernelRead *= query.ifmShape[0].Depth(); + } + + int ofmBlockCount = ofmBlocks.Elements(); + + access.ifmRead[0] = ifmFetch * subkernels * ofmBlockCount; + + if ( (npuOp != EthosU85NpuOp::Pooling) && (npuOp != EthosU85NpuOp::ReduceSum) ) + { + int weightFetch = kernelRead * ofmBlockDepth * ofmBlockCount; + access.constRead[0] = weightFetch; + access.constRead[1] = query.ofmShape.Depth(); // Scales & biases + access.weightsRefetch = ofmBlocks.ElementsWH(); + } + } + else if ( npuOp == EthosU85NpuOp::Elementwise ) + { + // IFM1 is scalar + if ( query.ifmShape[0].Elements() == 1 ) + { + if ( DataTypeSizeBits(query.ifmType[0]) > 8 ) // IFM1 is a non 8-bit scalar + { + access.ifmRead[0] = Shape::RoundAway(query.ifmShape[0], ifmRounding).Elements(); + } + else if ( query.ifmShape[1].Elements() > 0 ) + { + access.ifmRead[1] = Shape::RoundAway(query.ofmShape, ifmRounding).Elements(); + } + } + else // IFM1 is not scalar 
+ { + access.ifmRead[0] = Shape::RoundAway(query.ofmShape, ifmRounding).Elements(); + if ( query.ifmShape[1].Elements() > 0 ) + { + // IFM2 is not scalar + if ( query.ifmShape[1].Elements() > 1 ) + { + access.ifmRead[1] = access.ifmRead[0]; + } + else if ( DataTypeSizeBits(query.ifmType[1]) > 8 ) // IFM2 is a non 8-bit scalar + { + access.ifmRead[1] = Shape::RoundAway(query.ifmShape[1], ifmRounding).Elements(); + } + } + } + } + else if ( npuOp == EthosU85NpuOp::Resize ) + { + // TODO: Implement for Resize + access.ifmRead[0] = Shape::RoundAway(query.ifmShape[0], ifmRounding).Elements(); + access.ofmWrite = Shape::RoundAway(query.ofmShape[0], ofmRounding).Elements(); + } + else if ( npuOp == EthosU85NpuOp::Dma ) + { + if ( query.type == OpType::Gather ) + { + // One element from IFM0 (positions) is read per element in IFM1 (index) + access.ifmRead[0] = Shape::RoundAway(query.ifmShape[1], ifmRounding).Elements(); + + // Complete IFM1 (index) is read + access.ifmRead[1] = Shape::RoundAway(query.ifmShape[1], ifmRounding).Elements(); + + // Complete OFM is written + access.ofmWrite = Shape::RoundAway(query.ofmShape[0], ofmRounding).Elements(); + } + else + { + LOG_WARN("Missing element access estimation for DMA op {}\n", OpTypeToString(query.type).c_str()); + } + } + else + { + assert(false); + } + + access.ofmWrite = Shape::RoundAway(query.ofmShape, ofmRounding).Elements(); + + return access; +} + + +ElementAccess EthosU85Performance::ElementTransferToBytes(const PerformanceQuery &query, const ElementAccess &access) +{ + EthosU85OpConfig *opConfig = static_cast(query.config); + auto ifmBlock = opConfig ? opConfig->IfmBlock() : Shape(1, 1, 1, 1); + auto ofmBlock = opConfig ? opConfig->OfmBlock() : Shape(1, 1, 1, 1); + + ElementAccess result = access; + + // IFM bytes transferred + const int ifmCount = query.ifmShape[1].Elements() > 0 ? 
int(std::size(query.ifmShape)) : 1; + for ( int i = 0; i < ifmCount; i++ ) + { + result.ifmRead[i] = EstimateMemoryTransfer(_arch->_cores, true, query.ifmMemory[i], query.ifmFormat[i], + DataTypeSizeBits(query.ifmType[i]), ifmBlock, query.ifmShape[i], access.ifmRead[i]); + } + + // OFM bytes transferred + result.ofmWrite = EstimateMemoryTransfer(_arch->_cores, false, query.ofmMemory, query.ofmFormat, + DataTypeSizeBits(query.ofmType), ofmBlock, query.ofmShape, access.ofmWrite); + + // These requires compression ratio information + result.constRead[0] = 0; + result.constRead[1] = 0; + + return result; +} + +} // namespace regor diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_performance.hpp b/ethosu/regor/architecture/ethosu85/ethos_u85_performance.hpp new file mode 100644 index 00000000..114e4f2b --- /dev/null +++ b/ethosu/regor/architecture/ethosu85/ethos_u85_performance.hpp @@ -0,0 +1,60 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#pragma once + +#include "common/common.hpp" + +#include "architecture/architecture.hpp" + +namespace regor +{ + +class ArchEthosU85; + +struct EthosU85PerfInfo +{ + float outputCycles[8]; + float activationCycles[3]; +}; + +/// +/// Profiles performance analysis for Ethos-U85 +/// +class EthosU85Performance : public ArchitecturePerformance +{ +protected: + ArchEthosU85 *_arch; + const EthosU85PerfInfo *_perfInfo; + +public: + EthosU85Performance(ArchEthosU85 *arch, const EthosU85PerfInfo *perfInfo); + +public: + CycleCost MeasureCycleCost(const PerformanceQuery &query, const std::vector &fused) override; + int64_t MemToMemCycles(const ArchitectureMemory *dest, const ArchitectureMemory *source, int sizeBytes) override; + ElementAccess MeasureElementAccess(const PerformanceQuery &query) override; + ElementAccess ElementTransferToBytes(const PerformanceQuery &query, const ElementAccess &access) override; + +private: + int64_t EstimateConvCycles(const PerformanceQuery &query, const std::vector &fused); + float EstimateOutputCyclesPerElement(const PerformanceQuery &query, const std::vector &fused); + int64_t EstimateMinimumMemoryCycles(const PerformanceQuery &query); +}; + +} // namespace regor diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.cpp b/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.cpp new file mode 100644 index 00000000..cdab052f --- /dev/null +++ b/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.cpp @@ -0,0 +1,1828 @@ +// +// SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "ethos_u85_register_cs_generator.hpp" + +#include "common/common.hpp" +#include "common/logging.hpp" + +#include "architecture/ethos_u_scaling.hpp" +#include "common/data_type.hpp" +#include "compiler/high_level_command_stream.hpp" +#include "compiler/op_type.hpp" +#include "ethos_u85.hpp" +#define NPU_DISASSEMBLE +#define NPU_NAMESPACE ethosu85 +#include "ethos_u85_interface.hpp" +#include "ethos_u85_scaling.hpp" + +#include +#include +#include + +namespace regor +{ + +using namespace ethosu85; + +void EthosU85Emitter::Emit(uint32_t instr) +{ + uint16_t cmd = instr & 0xFFFF; + assert(IsCmd0(cmd)); + bool emit = IsOp(cmd) || SetRegister(cmd, instr); + if ( emit ) + { + _stream.push_back(instr); + } +} + +void EthosU85Emitter::Emit(uint64_t instr) +{ + uint16_t cmd = instr & 0xFFFF; + assert(IsCmd1(cmd)); + bool emit = IsOp(cmd) || SetRegister(cmd, instr); + if ( emit ) + { + _stream.push_back(uint32_t(instr)); + _stream.push_back(uint32_t(instr >> 32)); + } +} + +void EthosU85Emitter::Clear() +{ + _stream.clear(); + _registers.clear(); +} + + +bool EthosU85Emitter::SetRegister(uint16_t reg, uint64_t value) +{ + auto item = _registers.find(reg); + bool isChanged = item == _registers.end() || item->second != value; + if ( isChanged ) + { + _registers[reg] = value; + } + return isChanged; +} + +bool EthosU85Emitter::IsCmd0(uint16_t key) +{ + return (key >> 14) == uint16_t(cmd_ctrl::CMD0_CTRL); +} + +bool EthosU85Emitter::IsCmd1(uint16_t key) +{ + return (key >> 14) == uint16_t(cmd_ctrl::CMD1_CTRL); +} + +bool EthosU85Emitter::IsOp(uint16_t key) 
+{ + return IsCmd0(key) ? (key & (1 << 8)) == 0 : (key & (1 << 8)) != 0; +} + + +/// +/// Generates register command streams for Ethos U85. +/// +// TODO MLBEDSW-7985 add elementwise bitwise AND_NOT +namespace +{ +const std::unordered_map kElementwiseMap = { + {OpType::Add, elementwise_mode::ADD}, + {OpType::Sub, elementwise_mode::SUB}, + {OpType::Abs, elementwise_mode::ABS}, + {OpType::Mul, elementwise_mode::MUL}, + {OpType::Minimum, elementwise_mode::MIN}, + {OpType::Maximum, elementwise_mode::MAX}, + {OpType::LeakyRelu, elementwise_mode::LRELU}, + {OpType::CLZ, elementwise_mode::CLZ}, + {OpType::SHL, elementwise_mode::SHL}, + {OpType::SHR, elementwise_mode::LSR}, + {OpType::Div, elementwise_mode::DIV}, + {OpType::LogicalAnd, elementwise_mode::AND}, + {OpType::LogicalOr, elementwise_mode::OR}, + {OpType::LogicalXor, elementwise_mode::XOR}, + {OpType::LogicalNot, elementwise_mode::NOT}, + {OpType::And, elementwise_mode::AND}, + {OpType::Or, elementwise_mode::OR}, + {OpType::Xor, elementwise_mode::XOR}, + {OpType::Not, elementwise_mode::NOT}, + {OpType::Asr, elementwise_mode::SHR}, + {OpType::Equal, elementwise_mode::CMP_EQ}, + {OpType::Greater, elementwise_mode::CMP_GT}, + {OpType::GreaterEqual, elementwise_mode::CMP_GE}, + {OpType::NotEqual, elementwise_mode::CMP_NE}, +}; + +activation_type ToActivationType(DataType type) +{ + if ( IsSignedInteger(type) || IsBool(type) ) + { + return activation_type::SIGNED; + } + else + { + assert(IsInteger(type)); + return activation_type::UNSIGNED; + } +} + +activation_format ToActivationFormat(TensorFormat format) +{ + if ( format == TensorFormat::NHCWB16 ) + { + return activation_format::NHCWB16; + } + else + { + assert(format == TensorFormat::NHWC); + return activation_format::NHWC; + } +} + +activation_precision ToActivationPrecision(DataType type) +{ + switch ( DataTypeSizeBits(type) ) + { + case 8: + return activation_precision::B8; + case 16: + return activation_precision::B16; + case 32: + return 
activation_precision::B32; + case 64: + return activation_precision::B64; + default: + assert(false); + return activation_precision::B64; + } +} + +activation_transpose ToActivationTranspose(TransposeType type) +{ + switch ( type ) + { + case TransposeType::None: + case TransposeType::NHWC: + return activation_transpose::HWC; + case TransposeType::NWHC: + return activation_transpose::WHC; + case TransposeType::NHCW: + return activation_transpose::HCW; + case TransposeType::NWCH: + return activation_transpose::WCH; + case TransposeType::NCHW: + return activation_transpose::CHW; + case TransposeType::NCWH: + return activation_transpose::CWH; + default: + assert(false && "Unknown transpose mask"); + return activation_transpose::HWC; + } +} + +activation_reverse ToActivationReverse(ReverseType type) +{ + switch ( type ) + { + case ReverseType::None: + return activation_reverse::NONE; + case ReverseType::H: + return activation_reverse::H; + case ReverseType::W: + return activation_reverse::W; + case ReverseType::C: + return activation_reverse::C; + default: + assert(false && "Unknown reverse type"); + return activation_reverse::NONE; + } +} + +ifm_upscale_mode ToIfmUpscaleMode(ArchResampling resampling) +{ + if ( resampling == ArchResampling::Nearest ) + { + return ifm_upscale_mode::NEAREST; + } + if ( resampling == ArchResampling::Zeros ) + { + return ifm_upscale_mode::ZEROS; + } + return ifm_upscale_mode::NONE; +} + +resize_mode ToResizeMode(ArchResizeMode mode) +{ + if ( mode == ArchResizeMode::Bilinear ) + { + return resize_mode::BILINEAR; + } + if ( mode == ArchResizeMode::Nearest ) + { + return resize_mode::NEAREST; + } + return resize_mode::REPLICATE; +} + +round_mode_ofm GetOfmRoundingMode(const HLCOperation *op) +{ + switch ( op->rounding ) + { + case HLCRoundMode::NATURAL: + return round_mode_ofm::NATURAL; + case HLCRoundMode::TRUNCATE: + return round_mode_ofm::TRUNCATE_TO_ZERO; + case HLCRoundMode::DBL: + return round_mode_ofm::DOUBLE_SYMMETRIC; + case 
HLCRoundMode::AUTO: + return round_mode_ofm::DOUBLE_SYMMETRIC; + case HLCRoundMode::TRUNCATE_TO_LOWER: + return round_mode_ofm::TRUNCATE_TO_LOWER; + case HLCRoundMode::DOUBLE_ASYMMETRIC: + return round_mode_ofm::DOUBLE_ASYMMETRIC; + case HLCRoundMode::SYMMETRIC: + return round_mode_ofm::SYMMETRIC; + default: + return round_mode_ofm::DOUBLE_SYMMETRIC; + } +} +} // namespace + +uint32_t EthosU85RCSGenerator::ConfigRegister(int macs, int cmdStreamVersion, int numAxiSram, int numAxiExt, int numWd, int product) +{ + return config_r{} + .set_macs_per_cc(macs) + .set_cmd_stream_version(cmdStreamVersion) + .set_num_axi_sram(numAxiSram) + .set_num_axi_ext(numAxiExt) + .set_num_wd(numWd) + .set_product(product); +} + +uint32_t EthosU85RCSGenerator::IdRegister() +{ + return id_r{}; +} + +bool EthosU85RCSGenerator::IsSupportedElementwise(const OpType opType) +{ + return kElementwiseMap.count(opType) != 0; +} + +EthosU85RCSGenerator::EthosU85RCSGenerator(ArchEthosU85 *arch) : _arch(arch) +{ +} + +void EthosU85RCSGenerator::Emit(uint32_t instr) +{ + _emit.Emit(instr); +} + +void EthosU85RCSGenerator::Emit(uint64_t instr) +{ + _emit.Emit(instr); +} + + +int EthosU85RCSGenerator::GetDoubleBufferOffset(HLCWeights *weights, int rangeIndex) +{ + int doubleBufferOffset = 0; + if ( weights->buffering == Buffering::Double ) + { + assert(weights->subStreams > 0); + int depthIndex = rangeIndex / weights->subStreams; + if ( depthIndex % 2 == 1 ) + { + doubleBufferOffset = weights->maxRangeBytes; + } + } + return doubleBufferOffset; +} + + +void EthosU85RCSGenerator::CheckAddressRange(ArchitectureMemory *memory, Address address, int size) +{ + assert(address >= 0); + if ( address >= memory->SizeBytes() ) + { + LOG_ERROR("Error: Address out of bounds, address {0}, memory '{1}' with size {2}\n", address, memory->Name(), + memory->SizeBytes()); + // TODO: replace assert by error handling + assert(false && "Address out of bounds"); + } + assert(size >= 0); + if ( address + size > 
memory->SizeBytes() ) + { + LOG_ERROR("Error: Address offset out of bounds, address {0}, offset {1}, memory '{2}' with size {3}\n", address, + size, memory->Name(), memory->SizeBytes()); + // TODO: replace assert by error handling + assert(false && "address offset out of bounds"); + } +} + +void EthosU85RCSGenerator::CheckAddresses(const HLCFeatureMap &fm) +{ + CheckAddressRange(fm.memArea.memory, fm.address, fm.AllocationSizeBytes()); + assert(fm.address % 16 == 0 || fm.format != TensorFormat::NHCWB16); +} + +// Calculates the rolling buffer address of the given coordinate. +Address EthosU85RCSGenerator::AddressForCoordinate(const HLCFeatureMap &fm, const Shape &strides, const Shape &coord) +{ + Shape truncatedCoord = Shape::PadAxes(coord, 4, 0) % Shape::PadAxes(fm.shape, 4, 1); + int offset = 0; + if ( fm.format == TensorFormat::NHWC ) + { + offset = strides.Dot(truncatedCoord); + } + else if ( fm.format == TensorFormat::NHCWB16 ) + { + constexpr int BRICK = 16; + int elemSize = DataTypeSizeBits(fm.dataType) / 8; + int strideX = BRICK * elemSize; + offset = + truncatedCoord.Height() * strides.Height() + truncatedCoord.Width() * strideX + + (truncatedCoord.Depth() / BRICK) * strides.Depth() + (truncatedCoord.Depth() % BRICK) * elemSize + + truncatedCoord.Batch() * strides.Batch(); + } + else + { + assert(false); + } + return fm.address + offset; +} + +// Calculates tile sizes/addresses of a feature map +TileBox EthosU85RCSGenerator::GetTiles(const HLCFeatureMap &fm, const Shape &strides, const Box &area) +{ + int crossingY = RoundAway(area.Start().Height() + 1, fm.shape.Height()); + crossingY = std::min(crossingY, area.End().Height()); + int crossingX = RoundAway(area.Start().Width() + 1, fm.shape.Width()); + crossingX = std::min(crossingX, area.End().Width()); + TileBox tiles; + auto height = crossingY - area.Start().Height(); + auto width = crossingX - area.Start().Width(); + tiles.height0 = (height + fm.stepXY.y - 1) / fm.stepXY.y; + tiles.height1 = 
tiles.height0; + tiles.width0 = (width + fm.stepXY.x - 1) / fm.stepXY.x; + for ( int i = 0; i < 4; ++i ) + { + tiles.address[i] = 0; + } + int fmSize = fm.AllocationSizeBytes(); + tiles.address[0] = AddressForCoordinate(fm, strides, area.Start()); + assert(fm.address <= tiles.address[0] && tiles.address[0] < fm.address + fmSize); + if ( area.End().Width() > crossingX ) + { + tiles.address[1] = AddressForCoordinate(fm, strides, area.Start().WithWidth(crossingX)); + assert(fm.address <= tiles.address[1] && tiles.address[1] < fm.address + fmSize); + assert(false && "Striping in vertical direction is not supported"); + } + if ( area.End().Height() > crossingY ) + { + tiles.address[2] = AddressForCoordinate(fm, strides, area.Start().WithHeight(crossingY)); + assert(fm.address <= tiles.address[2] && tiles.address[2] < fm.address + fmSize); + } + if ( area.End().Width() > crossingX && area.End().Height() > crossingY ) + { + tiles.address[3] = AddressForCoordinate(fm, strides, area.Start().WithWidth(crossingX).WithHeight(crossingY)); + assert(fm.address <= tiles.address[3] && tiles.address[3] < fm.address + fmSize); + } + if ( fm.format == TensorFormat::NHCWB16 ) + { + for ( int i = 0; i < 4; ++i ) + { + assert(tiles.address[i] % 16 == 0 && "NHCWB16 base address is not 16-byte aligned"); + } + } + return tiles; +} + +MemoryAccess EthosU85RCSGenerator::ToMemoryAccess(const HLCFeatureMap &fm, const Box &area, AccessDirection direction) +{ + const auto &strides = fm.strides; + Address start = AddressForCoordinate(fm, strides, area.Start()); + // Note: due to truncating of shape, AddressForCoordinate(fm, .., fm.shape) returns + // fm.address; the - Shape(1, 1, 1) prevents this + Address end = AddressForCoordinate(fm, strides, area.End() - Shape(1, 1, 1)) + DataTypeSizeBits(fm.dataType) / 8; + if ( end < start ) + { + // Area wraps around the end of the feature map + start = fm.address; + end = fm.address + fm.AllocationSizeBytes(); + } + return MemoryAccess(direction, 
fm.memArea, start, end); +} + +// Returns region number used in NPU_SET_..._REGION +uint32_t EthosU85RCSGenerator::ToRegion(const MemArea &memArea) +{ + auto region = BasePointerIndex::WeightTensor; + if ( memArea == _arch->FeatureMapMemory() ) + { + region = BasePointerIndex::ScratchTensor; + } + else if ( memArea == _arch->StagingMemory() ) + { + region = BasePointerIndex::ScratchFastTensor; + } + else if ( memArea == _arch->LUTMemory() ) + { + region = BasePointerIndex::Mem2Mem; + } + else + { + assert(memArea == _arch->ReadonlyMemory()); + } + return uint32_t(region); +} + +bool EthosU85RCSGenerator::UseZeroPoint0(OpType opType, const HLCFeatureMap &fm, bool isOFM) +{ + if ( fm.quantization.forceZeroPoint ) + { + return false; + } + if ( fm.quantization.zeroPoints.empty() || (fm.dataType == DataType::Int32 && !isOFM) ) + { + return true; + } + return opType == OpType::AvgPool || opType == OpType::Resize || opType == OpType::CLZ || opType == OpType::SHL || opType == OpType::Div; +} + + +// Checks if the feature map is a scalar, and if so, returns the +// quantized value in scalarValue. 
+bool EthosU85RCSGenerator::IsScalar(const HLCFeatureMap &fm, int32_t &scalarValue) +{ + const auto &view = fm.bufferView; + // A 1-sized feature map in constant memory is a scalar + bool isScalar = fm.shape.Elements() == 1 && view.HasBuffer(); + if ( isScalar ) + { + if ( fm.dataType == DataType::Int8 ) + { + scalarValue = view.Values()[0]; + } + else if ( fm.dataType == DataType::UInt8 ) + { + scalarValue = view.Values()[0]; + } + else if ( fm.dataType == DataType::Int16 ) + { + scalarValue = view.Values()[0]; + } + else if ( fm.dataType == DataType::Int32 ) + { + scalarValue = view.Values()[0]; + } + else + { // Unsupported scalar value + isScalar = false; + } + } + return isScalar; +} + + +// Calculates waits for KERNEL_WAIT/DMA_WAIT, returns -1 if no wait is needed +// - opAccesses contains the memory accesses for the current operation +// - outstanding contains the memory accesses for ongoing "other" operations +// (DMA operations if the current op is an NPU operation, NPU operations if the current op is a DMA operation) +// Note: NPU - NPU dependency is handled via blockdep +int EthosU85RCSGenerator::CalcCommandWaits(const MemoryAccesses &opAccesses, std::deque &outstanding) +{ + int waits = 0; + for ( int index = int(outstanding.size()) - 1; index >= 0; ++waits, --index ) + { + for ( const auto &access : opAccesses ) + { + for ( const auto &outstandingAccess : outstanding[index] ) + { + if ( access.Conflicts(outstandingAccess) ) + { + // Current op needs to wait, and after it has waited, + // outstanding[0..index] are not outstanding any longer + for ( int i = 0; i <= index; ++i ) + { + outstanding.pop_front(); + } + return waits; + } + } + } + } + return -1; +} + +// Returns LUT slot to be used for the given LUT operation. +// Sets alreadyInLutMem to true if the LUT is already in SHRAM. 
+int EthosU85RCSGenerator::AllocateLutSlot( + std::vector &lutSlots, const HLCOperation *op, int sizeInSlots, int timestamp, bool &alreadyInLutMem) +{ + alreadyInLutMem = false; + int totalSlots = int(lutSlots.size()); + if ( sizeInSlots < 0 || sizeInSlots > totalSlots ) + { + assert(false); + return 0; + } + // Returns least recently used slot, unless the LUT is already in memory + int allocatedSlot = 0; + for ( int i = 0; i < totalSlots; i += sizeInSlots ) + { + if ( lutSlots[i].hlcOp == op ) + { + // LUT is already in SHRAM + allocatedSlot = i; + alreadyInLutMem = true; + break; + } + if ( lutSlots[i].lastUsed < lutSlots[allocatedSlot].lastUsed ) + { + allocatedSlot = i; + } + } + for ( int j = allocatedSlot; j < allocatedSlot + sizeInSlots; ++j ) + { + lutSlots[j].hlcOp = op; + lutSlots[j].lastUsed = timestamp; + } + return allocatedSlot; +} + +//---------------------------------------------------------------------- +// Print +//---------------------------------------------------------------------- + +int EthosU85RCSGenerator::Disassemble(const uint32_t *in, std::string &op, std::vector> &fields) +{ + return isa::disassemble(in, op, fields); +} + +//---------------------------------------------------------------------- +// Scaling (OFM/IFM/IFM2_SCALE) +//---------------------------------------------------------------------- + +// Generates OFM_SCALE register for pooling operations +void EthosU85RCSGenerator::GenerateOFMScalingForPooling(HLCOperation *poolOp) +{ + QuantizedScale ofmScale(1, 0); + bool isNoOp = _arch->UseAvgPoolNop(poolOp->type); + ethosU85Scaling::RescalePooling(poolOp, isNoOp); + if ( !poolOp->ofm.quantization.scales.empty() ) + { + ofmScale = poolOp->ofm.quantization.scales[0]; + assert(unsigned(ofmScale.shift) < 64); + } + Emit(isa::npu_set_ofm_scale_t(uint32_t(ofmScale.shift), 0, GetOfmRoundingMode(poolOp), ofmScale.scale)); +} + +// Generates OFM/IFM/IMF2_SCALE registers for elementwise operators. 
+void EthosU85RCSGenerator::GenerateScalingForElementwise(HLCOperation *op) +{ + auto opType = op->type; + int ifmCnt = int(op->ifm.size()); + + QuantizedScale input1Scale(1, 0); + QuantizedScale input2Scale(1, 0); + QuantizedScale outScale(1, 0); + ethosU85Scaling::RescaleElementwise(op); + + auto ifmRoundMode = round_mode_ifm::DOUBLE_SYMMETRIC; + uint32_t ifmDoubleRound = 0; + auto ofmRoundMode = GetOfmRoundingMode(op); + uint32_t ofmDoubleRound = 0; + + int bitDepth = DataTypeSizeBits(op->ifm[0].dataType); + if ( opType == OpType::Mul || opType == OpType::Abs || opType == OpType::LeakyRelu ) + { + if ( !op->ofm.quantization.scales.empty() ) + { + outScale = op->ofm.quantization.scales[0]; + } + if ( opType == OpType::LeakyRelu ) + { + if ( !op->ifm[0].quantization.scales.empty() ) + { + input1Scale = op->ifm[0].quantization.scales[0]; + } + const HLCParameters *params = &op->parameters; + float alpha = params->leaky_relu.alpha; + float ifm1Scale = input1Scale.Dequantize(); + input2Scale = QuantizedScale(alpha * ifm1Scale); + ifmCnt = 2; + } + } + else if ( opType == OpType::Div ) + { + // Div operations require unit scaling + if ( !op->ifm[0].quantization.scales.empty() ) + { + auto scale = op->ifm[0].quantization.scales[0]; + assert(scale.scale == 1); + assert(scale.shift == 0); + } + if ( ifmCnt == 2 && !op->ifm[1].quantization.scales.empty() ) + { + auto scale = op->ifm[1].quantization.scales[0]; + assert(scale.scale == 1); + assert(scale.shift == 0); + } + if ( !op->ofm.quantization.scales.empty() ) + { + auto scale = op->ofm.quantization.scales[0]; + assert(scale.scale == 1); + assert(scale.shift == 0); + } + } + else if ( opType == OpType::Add || opType == OpType::Sub ) + { + // Double round is used to compensate for the left shift that happens in AdvancedElementwiseAddSubScale + ifmDoubleRound = op->ifm[0].dataType == DataType::Int8 ? 
20 : 15; + if ( !op->ofm.quantization.scales.empty() && !op->ifm[0].quantization.scales.empty() && + !op->ifm[1].quantization.scales.empty() ) + { + outScale = op->ofm.quantization.scales[0]; + input1Scale = op->ifm[0].quantization.scales[0]; + input2Scale = op->ifm[1].quantization.scales[0]; + } + } + assert(unsigned(input1Scale.shift) < 64); + Emit(isa::npu_set_ifm_scale_t(input1Scale.shift, ifmDoubleRound, ifmRoundMode, input1Scale.scale)); + if ( ifmCnt == 2 ) + { + assert(unsigned(input2Scale.shift) < 64); + Emit(isa::npu_set_ifm2_scale_t(input2Scale.shift, ifmDoubleRound, ifmRoundMode, input2Scale.scale)); + } + assert(unsigned(outScale.shift) < 64); + Emit(isa::npu_set_ofm_scale_t(outScale.shift, ofmDoubleRound, ofmRoundMode, outScale.scale)); +} + +//---------------------------------------------------------------------- +// BLOCKDEP calculation +//---------------------------------------------------------------------- + +static Shape CalcIFMJobShape(const Shape &ofmBlock, Kernel *kernel, int ifmBlockDepth) +{ + Point2i dilatedSize = kernel->DilatedWH(); + // TODO MLBEDSW-8498: Consider ifm_upscale_mode for job-shape calculations + int h = RequiredInputSize(ofmBlock.Height(), kernel->Stride().y, dilatedSize.y, 1); + int w = RequiredInputSize(ofmBlock.Width(), kernel->Stride().x, dilatedSize.x, 1); + return Shape(1, h, w, ifmBlockDepth); +} + +// Given the area and block size, adds the first/last jobs (depending on fromStart) to jobs. 
+// - area: total amount of work to perform +// - jobShape: size of each job +// - fromStart: if true, the first jobs are added, if false, the last jobs are added +// (in that case, the very last job is added last) +void EthosU85RCSGenerator::GetJobs(const Box &area, const Shape &jobShape, int nrJobsToGet, bool fromStart, std::vector &jobs) +{ + Shape jobSplit = Shape::DivRoundUp(area.End() - area.Start(), jobShape); + int z = jobSplit.Depth(); + int w = jobSplit.Width(); + int h = jobSplit.Height(); + int n = z * w * h; // n = total number of jobs for the whole area + const auto &start = area.Start().Extract(-3, -2, -1); + const auto &end = area.End().Extract(-3, -2, -1); + int firstJob = fromStart ? 0 : std::max(0, n - nrJobsToGet); + int lastJob = fromStart ? std::min(n, nrJobsToGet) : n; + for ( int i = firstJob; i < lastJob; ++i ) + { + Shape from = Shape(start.Height() + (i / (z * w)) * jobShape.Height(), + start.Width() + ((i / z) % w) * jobShape.Width(), start.Depth() + (i % z) * jobShape.Depth()); + + jobs.emplace_back(from, Shape::Min(from + jobShape, end)); + } +} + +// Calculates the value for the BLOCKDEP register +int EthosU85RCSGenerator::CalcBlockDep(HLCStripe *prevStripe, HLCStripe *stripe) +{ + if ( prevStripe == nullptr ) + { + return 0; + } + const auto &op = stripe->operation; + const auto &prevOp = prevStripe->operation; + const auto &prevOfm = prevOp->ofm; + + // TODO: Investigate if this is correct + if ( !IsNone(prevOp->ofm.transpose) ) + { + return 0; + } + + int ifmIndex = (op->ifm.size() > 1 && op->ifm[1].address == prevOfm.address && op->ifm[1].memArea == prevOfm.memArea) ? 
1 : 0; + const auto &ifm = op->ifm[ifmIndex]; + int maxJobs = _arch->MaxBlockdep(); + if ( ifm.address != prevOfm.address || ifm.memArea != prevOfm.memArea ) + { + for ( const auto &fm : op->ifm ) + { + if ( fm.memArea == prevOfm.memArea && + Overlaps(fm.address, fm.address + fm.AllocationSizeBytes(), prevOfm.address, prevOfm.address + prevOfm.AllocationSizeBytes()) ) + { + // Previous OFM overlaps in unexpected way with current IFM + assert(false && "Unexpected overlap previous OFM/current IFM"); + return 0; + } + } + // Previous operation does not produce current operation's IFM + return maxJobs; + } + if ( op->ifm.size() > 1 && ifm.AllocationSizeBytes() < op->ifm[1 - ifmIndex].AllocationSizeBytes() ) + { + // Prev OFM produces IFM2 which is broadcasted (this should be rare) + return 0; + } + if ( prevOfm.shape != ifm.shape ) + { + // OFM has been reshaped; the job overlap calculations below do not work in this case + return 0; + } + // Previous operation produces current operations IFM + auto prevConfig = static_cast(prevOp->config); + if ( !prevConfig ) + { + // Previous operation doesn't have a block config + return 0; + } + Shape prevBlock = prevConfig->OfmBlock(); + auto config = static_cast(op->config); + if ( !config ) + { + // Current operation doesn't have a block config + return 0; + } + Shape currBlock = CalcIFMJobShape(config->OfmBlock(), &op->kernel, config->IfmBlock().Depth()); + // Get the last few jobs from the previous operation (each job produces a part of the current op's IFM) + std::vector lastPrevJobs; + GetJobs(prevStripe->ofmArea, prevBlock, maxJobs, false, lastPrevJobs); + // Get the first few jobs from the current operation (each job consumes a part of the current op's IFM) + std::vector firstCurrJobs; + GetJobs(stripe->ifmAreas[ifmIndex], currBlock, maxJobs, true, firstCurrJobs); + // Find the highest blockdep such that there is no overlap between + // any job from the previous op with any job from the current op during blockdep jobs + 
int sz = int(std::min(lastPrevJobs.size(), firstCurrJobs.size())); + int prevLastIx = int(lastPrevJobs.size()) - 1; + for ( int blockdep = 0; blockdep < sz; ++blockdep ) + { + bool overlaps = false; + for ( int i = 0; !overlaps && i <= blockdep; ++i ) + { + for ( int j = blockdep - i; !overlaps && i + j <= blockdep; ++j ) + { + if ( firstCurrJobs[i].Overlaps(lastPrevJobs[prevLastIx - j]) ) + { + overlaps = true; + } + } + } + if ( overlaps ) + { + return blockdep; + } + } + // No overlap found + return sz; +} + +//---------------------------------------------------------------------- +// Register generation +//---------------------------------------------------------------------- + +void EthosU85RCSGenerator::GeneratePadding(const HLCPadding &padding) +{ + Emit(isa::npu_set_ifm_pad_top_t(padding.top)); + Emit(isa::npu_set_ifm_pad_left_t(padding.left)); + Emit(isa::npu_set_ifm_pad_bottom_t(padding.bottom)); + Emit(isa::npu_set_ifm_pad_right_t(padding.right)); +} + +// Generates ACTIVATION registers +void EthosU85RCSGenerator::GenerateActivation(const HLCStripe *stripe, MemoryAccesses &memoryAccesses) +{ + const HLCOperation *op = stripe->operation.get(); + assert(op->subOps.size() <= 1); + OpType opType = OpType::None; + if ( IsActivation(op->type) ) + { + // Non-fused activation + opType = op->type; + assert(op->subOps.empty() || opType == op->subOps[0].type || opType == OpType::Sigmoid || opType == OpType::Tanh); + } + else if ( !op->subOps.empty() ) + { + // Fused activation + opType = op->subOps[0].type; + } + auto &ofm = op->ofm; + auto &ifm = op->ifm[0]; + int size = std::min(16, DataTypeSizeBits(ofm.dataType)); + assert(size > 0 && "Illegal data type"); + bool isSigned = ToActivationType(ofm.dataType) == activation_type::SIGNED; + int64_t quantizedMin = isSigned ? -(1LL << (size - 1)) : 0; + int64_t quantizedMax = isSigned ? 
(1LL << (size - 1)) - 1 : (1 << size) - 1; + auto act = activation_function::LUT_NONE; + uint32_t tableIndex = 0; + auto clipRange = DataTypeSizeBits(ofm.dataType) > 16 ? activation_clip_range::NONE : activation_clip_range::B16; + if ( ofm.quantization.quantMin.size() ) + { + quantizedMin = std::max(quantizedMin, ofm.quantization.quantMin[0]); + } + if ( ofm.quantization.quantMax.size() ) + { + quantizedMax = std::min(quantizedMax, ofm.quantization.quantMax[0]); + } + + if ( opType == OpType::LUT || opType == OpType::Sigmoid || opType == OpType::Tanh ) + { + auto ¶m = op->subOps[0].parameters.lut; + int lutSize = param.sizeBytes; + + auto pos = _stripeToLutSlot.find(stripe); + if ( pos != _stripeToLutSlot.end() ) + { + tableIndex = pos->second; + } + else + { + assert(false && "Command uses lut, but no lut info found"); + } + + // tableIndex is based on 8 slots of size 256 and alignment is the same as the LUT size + // Hardware expects 0-7 tables for 256 LUT + // 0-4 tables for 512 LUT + // 0-1 tables for 1k LUT + // 1 table for 2k LUT + // So for 512 and 1k the tableIndex is adjusted below + switch ( ofm.dataType ) + { + case DataType::Int8: + assert(lutSize == 256); + assert(param.ifmType == DataType::Int8); + act = activation_function::LUT_S8_S8; + break; + case DataType::UInt8: + assert(lutSize == 256); + assert(param.ifmType == DataType::UInt8); + act = activation_function::LUT_U8_U8; + break; + case DataType::Int16: + if ( param.ifmType == DataType::Int8 ) + { + assert(lutSize == 512 && tableIndex % 2 == 0); + act = activation_function::LUT_S8_S16; + } + else + { + assert(lutSize == 2048 && tableIndex == 0); + assert(param.ifmType == DataType::Int16); + if ( opType == OpType::LUT ) act = activation_function::LUT_S16_S16; + else if ( opType == OpType::Sigmoid ) act = activation_function::LUT_SIGMOID; + else act = activation_function::LUT_TANH; + } + break; + case DataType::Int32: + if ( param.ifmType == DataType::Int8 ) + { + assert(lutSize == 1024 && 
tableIndex % 4 == 0); + act = activation_function::LUT_S8_S32; + } + else + { + assert(lutSize == 2048 && tableIndex == 0); + assert(param.ifmType == DataType::Int16); + act = activation_function::LUT_S16_S32; + } + break; + default: + assert(false && "Unsupported LUT table"); + break; + } + + // Adjust table for 512 and 1k + tableIndex = tableIndex / (lutSize / ArchEthosU85::LUT_SLOT_SIZE); + + Address lutStart = Address(tableIndex) * lutSize; + memoryAccesses.emplace_back(AccessDirection::Read, _arch->LUTMemory(), lutStart, lutStart + lutSize); + } + assert(quantizedMin <= std::numeric_limits::max()); + assert(quantizedMax <= std::numeric_limits::max()); + Emit(isa::npu_set_activation_t(act, tableIndex, clipRange)); + Emit(isa::npu_set_activation_min_t(uint32_t(quantizedMin))); + Emit(isa::npu_set_activation_max_t(uint32_t(quantizedMax))); +} + +// Generates KERNEL related registers +void EthosU85RCSGenerator::GenerateKernel(const Kernel &kernel, bool partKernel) +{ + auto dilatedWH = kernel.DilatedWH(); + Emit(isa::npu_set_kernel_height_m1_t(dilatedWH.y - 1)); + Emit(isa::npu_set_kernel_width_m1_t(dilatedWH.x - 1)); + uint32_t stride_x_lsb = (kernel.Stride().x - 1) & 1; + uint32_t stride_y_lsb = (kernel.Stride().y - 1) & 1; + uint32_t stride_x_msb = ((kernel.Stride().x - 1) >> 1) & 1; + uint32_t stride_y_msb = ((kernel.Stride().y - 1) >> 1) & 1; + auto weightOrder = partKernel ? 
weight_order::PART_KERNEL_FIRST : weight_order::DEPTH_FIRST; + kernel_dilation dilation_x = kernel_dilation(kernel.Dilation().x - 1); + kernel_dilation dilation_y = kernel_dilation(kernel.Dilation().y - 1); + kernel_decomposition decomposition = kernel_decomposition::D8X8; // Kernel decomposition + Emit(isa::npu_set_kernel_stride_t( + stride_x_lsb, stride_y_lsb, weightOrder, dilation_x, dilation_y, decomposition, stride_x_msb, stride_y_msb)); +} + + +// Generates IFM_BROADCAST/IFM2_BROADCAST register for binary elementwise operations +static broadcast_mode CalculateBroadcast(const Shape &shape1, const Shape &shape2) +{ + uint8_t mode = uint8_t(broadcast_mode::NONE); + if ( shape1.Height() < shape2.Height() && shape1.Height() == 1 ) + { + // Broadcast in 'H' dimension + mode |= uint8_t(broadcast_mode::H); + } + if ( shape1.Width() < shape2.Width() && shape1.Width() == 1 ) + { + // Broadcast in 'W' dimension + mode |= uint8_t(broadcast_mode::W); + } + if ( shape1.Depth() < shape2.Depth() && shape1.Depth() == 1 ) + { + // Broadcast in 'C' dimension + mode |= uint8_t(broadcast_mode::C); + } + return broadcast_mode(mode); +} + +void EthosU85RCSGenerator::GenerateInputBroadcast(const Shape &ifmShape, const Shape &ifm2Shape, bool ifmIsScalar, bool ifm2IsScalar) +{ + assert(!(ifmIsScalar && ifm2IsScalar)); + // IFM broadcast + auto broadcastMode = ifmIsScalar ? broadcast_mode::SCALAR : CalculateBroadcast(ifmShape, ifm2Shape); + Emit(isa::npu_set_ifm_broadcast_t(broadcastMode)); + + // IFM2 broadcast + broadcastMode = ifm2IsScalar ? 
broadcast_mode::SCALAR : CalculateBroadcast(ifm2Shape, ifmShape); + Emit(isa::npu_set_ifm2_broadcast_t(broadcastMode)); +} + +// Generates IFM_PRECISION register +void EthosU85RCSGenerator::GenerateIFMPrecision(const HLCFeatureMap &fm) +{ + activation_type type = ToActivationType(fm.dataType); + activation_precision precision = ToActivationPrecision(fm.dataType); + activation_format format = ToActivationFormat(fm.format); + activation_storage storage = activation_storage::TILE2X2; + Emit(isa::npu_set_ifm_precision_t(type, precision, format, storage)); +} + +// Generates IFM2_PRECISION register +void EthosU85RCSGenerator::GenerateIFM2Precision(const HLCFeatureMap &fm) +{ + activation_type type = ToActivationType(fm.dataType); + activation_precision precision = ToActivationPrecision(fm.dataType); + activation_format format = ToActivationFormat(fm.format); + activation_storage storage = activation_storage::TILE2X2; + Emit(isa::npu_set_ifm2_precision_t(type, precision, format, storage)); +} + +// Generates OFM_PRECISION register +void EthosU85RCSGenerator::GenerateOFMPrecision(const HLCFeatureMap &fm, bool useGlobalScale, bool enable_output) +{ + activation_type type = ToActivationType(fm.dataType); + activation_precision precision = ToActivationPrecision(fm.dataType); + activation_format format = ToActivationFormat(fm.format); + auto scaleMode = useGlobalScale ? ofm_scale_mode::GLOBAL : ofm_scale_mode::PER_CHANNEL; + activation_reverse reverse = ToActivationReverse(fm.reverse); + activation_transpose transpose = ToActivationTranspose(fm.transpose); + // TODO implement MLBEDSW-7867 storage + activation_storage storage = enable_output ? 
activation_storage::TILE2X2 : activation_storage::NONE; + if ( reverse != activation_reverse::NONE ) + { + assert(transpose == activation_transpose::HWC && "Can't combine reverse and transpose"); + assert(storage != activation_storage::CHAINED && "Can't combine reverse and chaining"); + } + if ( transpose != activation_transpose::HWC ) + { + assert(reverse == activation_reverse::NONE && "Can't combine transpose and reverse"); + assert(storage != activation_storage::CHAINED && "Can't combine transpose and chaining"); + } + if ( storage == activation_storage::CHAINED ) + { + assert(reverse == activation_reverse::NONE && "Can't combine chaining and reverse"); + assert(transpose == activation_transpose::HWC && "Can't combine chaining and transpose"); + } + Emit(isa::npu_set_ofm_precision_t(type, precision, format, scaleMode, reverse, transpose, storage)); +} + +// Generates common IFM registers +void EthosU85RCSGenerator::GenerateIFM(OpType opType, const HLCFeatureMap &fm, const Box &inputArea, bool isScalar, int32_t scalarValue) +{ + if ( isScalar ) + { + Emit(isa::npu_set_op_scalar_t(uint32_t(scalarValue))); + } + else + { + CheckAddresses(fm); + Emit(isa::npu_set_ifm_region_t(ToRegion(fm.memArea))); + Shape strides = fm.strides; + auto tiles = GetTiles(fm, strides, inputArea); + auto boxSize = inputArea.SizeShape(); + // IFM_BASE registers + Emit(isa::npu_set_ifm_base0_t(tiles.address[0])); + Emit(isa::npu_set_ifm_base1_t(tiles.address[1])); + Emit(isa::npu_set_ifm_base2_t(tiles.address[2])); + Emit(isa::npu_set_ifm_base3_t(tiles.address[3])); + // Tile related registers + Emit(isa::npu_set_ifm_height0_m1_t(tiles.height0 - 1)); + Emit(isa::npu_set_ifm_height1_m1_t(tiles.height1 - 1)); + Emit(isa::npu_set_ifm_width0_m1_t(tiles.width0 - 1)); + Emit(isa::npu_set_ifm_depth_m1_t(boxSize.Depth() - 1)); + // IFM_STRIDE registers + Emit(isa::npu_set_ifm_stride_y_t(strides.Height() * fm.stepXY.y)); + Emit(isa::npu_set_ifm_stride_x_t(strides.Width() * fm.stepXY.x)); + 
Emit(isa::npu_set_ifm_stride_c_t(strides.Depth())); + } + // IFM_ZERO_POINT register + auto &quant = fm.quantization; + uint32_t zp = UseZeroPoint0(opType, fm, false) ? 0 : uint32_t(quant.zeroPoints[0]); + Emit(isa::npu_set_ifm_zero_point_t(zp)); +} + +// Generates common IFM2 registers +void EthosU85RCSGenerator::GenerateIFM2(OpType opType, const HLCFeatureMap &fm, const Box &inputArea, bool isScalar, int32_t scalarValue) +{ + if ( isScalar ) + { + Emit(isa::npu_set_op_scalar_t(uint32_t(scalarValue))); + } + else + { + CheckAddresses(fm); + Emit(isa::npu_set_ifm2_region_t(ToRegion(fm.memArea))); + Shape strides = fm.strides; + auto tiles = GetTiles(fm, strides, inputArea); + // IFM2_BASE registers + Emit(isa::npu_set_ifm2_base0_t(tiles.address[0])); + Emit(isa::npu_set_ifm2_base1_t(tiles.address[1])); + Emit(isa::npu_set_ifm2_base2_t(tiles.address[2])); + Emit(isa::npu_set_ifm2_base3_t(tiles.address[3])); + // Tile related registers + Emit(isa::npu_set_ifm2_height0_m1_t(tiles.height0 - 1)); + Emit(isa::npu_set_ifm2_height1_m1_t(tiles.height1 - 1)); + Emit(isa::npu_set_ifm2_width0_m1_t(tiles.width0 - 1)); + // IFM2_STRIDE registers + Emit(isa::npu_set_ifm2_stride_y_t(strides.Height() * fm.stepXY.y)); + Emit(isa::npu_set_ifm2_stride_x_t(strides.Width() * fm.stepXY.x)); + Emit(isa::npu_set_ifm2_stride_c_t(strides.Depth())); + } + // IFM2_ZERO_POINT register + auto &quant = fm.quantization; + uint32_t zp = UseZeroPoint0(opType, fm, false) ? 
0 : uint32_t(quant.zeroPoints[0]); + Emit(isa::npu_set_ifm2_zero_point_t(zp)); +} + +// Generates OFM registers +void EthosU85RCSGenerator::GenerateOFM(OpType opType, const HLCFeatureMap &fm, const Box &outputArea) +{ + CheckAddresses(fm); + Emit(isa::npu_set_ofm_region_t(ToRegion(fm.memArea))); + Shape strides = fm.strides; + auto tiles = GetTiles(fm, strides, outputArea); + auto boxSize = outputArea.SizeShape().Untranspose(fm.transpose); + // OFM_BASE registers + Emit(isa::npu_set_ofm_base0_t(tiles.address[0])); + Emit(isa::npu_set_ofm_base1_t(tiles.address[1])); + Emit(isa::npu_set_ofm_base2_t(tiles.address[2])); + Emit(isa::npu_set_ofm_base3_t(tiles.address[3])); + // OFM size (shape *before* transposition) //TODO: Maybe transpose stepXY. Here or in tiles? + Emit(isa::npu_set_ofm_height_m1_t(DivRoundUp(boxSize.Height(), fm.stepXY.y) - 1)); + Emit(isa::npu_set_ofm_width_m1_t(DivRoundUp(boxSize.Width(), fm.stepXY.x) - 1)); + Emit(isa::npu_set_ofm_depth_m1_t(boxSize.Depth() - 1)); + // Tile related registers (shape *after* transposition) + Emit(isa::npu_set_ofm_height0_m1_t(tiles.height0 - 1)); + Emit(isa::npu_set_ofm_height1_m1_t(tiles.height1 - 1)); + Emit(isa::npu_set_ofm_width0_m1_t(tiles.width0 - 1)); + // OFM_STRIDE registers + Emit(isa::npu_set_ofm_stride_y_t(strides.Height() * fm.stepXY.y)); + Emit(isa::npu_set_ofm_stride_x_t(strides.Width() * fm.stepXY.x)); + Emit(isa::npu_set_ofm_stride_c_t(strides.Depth())); + // OFM_ZERO_POINT register + auto &quant = fm.quantization; + uint32_t zp = UseZeroPoint0(opType, fm, true) ? 
0 : uint32_t(quant.zeroPoints[0]); + Emit(isa::npu_set_ofm_zero_point_t(zp)); +} + +// Generates WEIGHT registers +void EthosU85RCSGenerator::GenerateWeights(const HLCStripe *stripe, MemoryAccesses &memoryAccesses) +{ + auto weights = stripe->operation->weights.get(); + if ( weights == nullptr ) + { + return; + } + + EthosU85OpConfig *config = static_cast(stripe->operation->config); + + auto wgtFormat = (weights->format & WeightFormat::Fast) ? weight_format::FWD : weight_format::SWD; + auto wgtSparsity = (weights->format & WeightFormat::Sparse2_4) ? weight_sparsity::SPARSE_2_4 : weight_sparsity::NONE; + Emit(isa::npu_set_weight_format_t(wgtFormat, wgtSparsity)); + + int depth = stripe->weightRangeDepth; + Emit(isa::npu_set_weight_region_t(ToRegion(weights->memArea))); + int offset = 0; + for ( int i = 0; i < _arch->_cores; ++i ) + { + Address address = 0; + int length = 0; + auto item = weights->encodedRanges.find(WeightKey(i, depth)); + if ( item != weights->encodedRanges.end() ) + { + const auto &range = item->second; + int doubleBufferOffset = GetDoubleBufferOffset(weights, range.index); + address = weights->address + offset + range.weightOffset + doubleBufferOffset; + length = RoundAway(range.weightBytes, 16); + CheckAddressRange(weights->memArea.memory, address, length); + memoryAccesses.emplace_back(AccessDirection::Read, weights->memArea, address, address + length); + offset += RoundAway(range.TotalBytes(), 16); + } + + switch ( i ) + { + case 0: + if ( length != 0 ) Emit(isa::npu_set_weight_base_t(address)); + Emit(isa::npu_set_weight_length_t(length)); + break; + case 1: + if ( length != 0 ) Emit(isa::npu_set_weight1_base_t(address)); + Emit(isa::npu_set_weight1_length_t(length)); + break; + case 2: + if ( length != 0 ) Emit(isa::npu_set_weight2_base_t(address)); + Emit(isa::npu_set_weight2_length_t(length)); + break; + case 3: + if ( length != 0 ) Emit(isa::npu_set_weight3_base_t(address)); + Emit(isa::npu_set_weight3_length_t(length)); + break; + 
default: + assert(false); + } + } +} + +// Generates SCALE registers +void EthosU85RCSGenerator::GenerateScales(const HLCStripe *stripe, MemoryAccesses &memoryAccesses) +{ + auto scales = stripe->operation->scales.get(); + if ( scales == nullptr ) + { + assert(!stripe->operation->weights); + return; + } + int depth = stripe->weightRangeDepth; + Emit(isa::npu_set_scale_region_t(ToRegion(scales->memArea))); + auto item0 = scales->encodedRanges.find(WeightKey(0, depth)); + assert(item0 != scales->encodedRanges.end()); + auto &range0 = item0->second; + int doubleBufferOffset = GetDoubleBufferOffset(scales, range0.index); + Address address = scales->address + doubleBufferOffset; + int length = RoundAway(range0.scaleBytes, 16); + + CheckAddressRange(scales->memArea.memory, address, length); + Emit(isa::npu_set_scale_base_t(address)); + Emit(isa::npu_set_scale_length_t(length)); + memoryAccesses.emplace_back(AccessDirection::Read, scales->memArea, address, address + length); +} + +// Generates OFM_BLK_HEIGHT/WIDTH/DEPTH registers +void EthosU85RCSGenerator::GenerateBlockConfig(const EthosU85OpConfig *config, const HLCFeatureMap &fm) +{ + Shape blk = config->OfmBlock(); + // Block constraints for transpose + switch ( fm.transpose ) + { + case TransposeType::NWHC: + assert(blk.Width() <= _arch->_ofmBlockMax.Height() && "Illegal OFM block height"); + assert(blk.Height() <= _arch->_ofmBlockMax.Width() && "Illegal OFM block width"); + break; + case TransposeType::NHCW: + assert(blk.Depth() <= _arch->_ofmBlockMax.Width() && "Illegal OFM block width"); + assert(blk.Width() <= _arch->_ofmBlockMax.Depth() && "Illegal OFM block depth"); + // Width must be multiple of 16 if brick format + assert((fm.format != TensorFormat::NHCWB16 || blk.Width() % 16 == 0) && "Illegal OFM block width for brick format"); + break; + case TransposeType::NCWH: + assert(blk.Depth() <= _arch->_ofmBlockMax.Height() && "Illegal OFM block height"); + assert(blk.Height() <= _arch->_ofmBlockMax.Depth() && 
"Illegal OFM block depth"); + // Height must be multiple of 16 if brick format + assert((fm.format != TensorFormat::NHCWB16 || blk.Height() % 16 == 0) && "Illegal OFM block height for brick format"); + break; + case TransposeType::NCHW: + assert(blk.Depth() <= _arch->_ofmBlockMax.Height() && "Illegal OFM block height"); + assert(blk.Height() <= _arch->_ofmBlockMax.Width() && "Illegal OFM block width"); + assert(blk.Width() <= _arch->_ofmBlockMax.Depth() && "Illegal OFM block depth"); + // Width must be multiple of 16 if brick format + assert((fm.format != TensorFormat::NHCWB16 || blk.Width() % 16 == 0) && "Illegal OFM block width for brick format"); + break; + case TransposeType::NWCH: + assert(blk.Width() <= _arch->_ofmBlockMax.Height() && "Illegal OFM block height"); + assert(blk.Depth() <= _arch->_ofmBlockMax.Width() && "Illegal OFM block width"); + assert(blk.Height() <= _arch->_ofmBlockMax.Depth() && "Illegal OFM block depth"); + // Height must be multiple of 16 if brick format + assert((fm.format != TensorFormat::NHCWB16 || blk.Height() % 16 == 0) && "Illegal OFM block height for brick format"); + break; + default: + break; + } + // OFM block (shape *before* transposition) + Emit(isa::npu_set_ofm_blk_height_m1_t(blk.Height() - 1)); + Emit(isa::npu_set_ofm_blk_width_m1_t(blk.Width() - 1)); + Emit(isa::npu_set_ofm_blk_depth_m1_t(blk.Depth() - 1)); +} + +// Generates ACC_FORMAT register +void EthosU85RCSGenerator::GenerateAccFormat(const EthosU85OpConfig *config) +{ + auto accType = config->_accumulatorType; + acc_format format = accType == EthosU85Accumulator::Acc32 ? 
acc_format::I32 : acc_format::I48; + + auto w = config->OfmUBlock().Width(); + auto h = config->OfmUBlock().Height(); + microblock block = microblock::U1X1; + + switch ( h << 4 | w ) + { + case 0x11: + block = microblock::U1X1; + break; + case 0x12: + block = microblock::U1X2; + break; + case 0x14: + block = microblock::U1X4; + break; + case 0x22: + block = microblock::U2X2; + break; + case 0x24: + block = microblock::U2X4; + break; + case 0x44: + block = microblock::U4X4; + break; + default: + assert(false && "Invalid microblock"); + } + acc_input input; + switch ( config->AccSource() ) + { + case ArchAccumulatorSource::Acc: + input = acc_input::KEEP; + break; + case ArchAccumulatorSource::Ifm2: + input = acc_input::IFM2; + break; + case ArchAccumulatorSource::Reset: + default: + input = acc_input::RESET; + } + acc_output output = config->AccOutputEnabled() ? acc_output::ENABLE : acc_output::DISABLE; + + Emit(isa::npu_set_acc_format_t(format, input, output, block)); +} + +// Calculates and generates KERNEL_WAIT or DMA_WAIT register +void EthosU85RCSGenerator::GenerateWaits(bool isKernelWait, const MemoryAccesses &memoryAccesses, int maxWaits, + std::deque &outstandingAccesses, std::deque &accessesToUpdate) +{ + int waits = CalcCommandWaits(memoryAccesses, outstandingAccesses); + if ( waits >= 0 ) + { + if ( isKernelWait ) + { + Emit(isa::npu_op_kernel_wait_t(waits)); + } + else + { + Emit(isa::npu_op_dma_wait_t(waits)); + } + } + accessesToUpdate.push_back(memoryAccesses); + if ( int(accessesToUpdate.size()) > maxWaits ) + { + accessesToUpdate.pop_front(); + } +} + +// Inserts DMA commands for copying LUTs from constant memory +// to LUT memory +std::vector> +EthosU85RCSGenerator::InsertLUTDMACommands(std::vector> &cmds) +{ + std::vector> result; + int lutSlotSize = ArchEthosU85::LUT_SLOT_SIZE; + int slots = int(_arch->_lutRam->SizeBytes() / lutSlotSize); + std::vector lutSlots(slots); + int timestamp = 0; + result.reserve(cmds.size()); + for ( auto &hlc : cmds ) 
+ { + ++timestamp; + if ( hlc->IsStripe() ) + { + auto stripe = static_cast(hlc.get()); + auto op = stripe->operation; + auto config = static_cast(op->config); + if ( !op->subOps.empty() && op->subOps[0].type == OpType::LUT ) + { + const auto &srcTens = op->subOps[0].parameters.lut; + assert(srcTens.sizeBytes % lutSlotSize == 0); + bool alreadyInLutMem; + int sizeInSlots = srcTens.sizeBytes / lutSlotSize; + int slot = AllocateLutSlot(lutSlots, op.get(), sizeInSlots, timestamp, alreadyInLutMem); + _stripeToLutSlot[stripe] = slot; + + if ( !alreadyInLutMem ) + { + auto dma = std::make_unique(); + dma->srcMemArea = srcTens.memArea; + dma->srcAddress = srcTens.address; + dma->length = srcTens.sizeBytes; + dma->destMemArea = _arch->LUTMemory(); + dma->destAddress = slot * lutSlotSize; + result.push_back(std::move(dma)); + } + } + } + result.push_back(std::move(hlc)); + } + return result; +} + +//---------------------------------------------------------------------- +// Operations +//---------------------------------------------------------------------- + +// Generates NPU_OP_* command +void EthosU85RCSGenerator::GenerateOperationCode(const HLCOperation *op) +{ + auto opType = op->type; + if ( opType == OpType::Resize ) + { + resize_mode mode = ToResizeMode(op->parameters.resize.mode); + Emit(isa::npu_op_resize_t(mode)); + } + else if ( IsPooling(opType) ) + { + pooling_mode mode; + if ( opType == OpType::AvgPool ) + { + auto kernelSize = op->kernel.Size(); + // SUM when kernel size > 8x8 + mode = (kernelSize.x <= 8 && kernelSize.y <= 8) ? pooling_mode::AVERAGE : pooling_mode::SUM; + } + else if ( opType == OpType::MaxPool ) + { + mode = pooling_mode::MAX; + } + else if ( opType == OpType::ArgMax ) + { + auto axis = op->parameters.argmax.axis; + assert(axis == 1 || axis == 2); + mode = axis == 1 ? 
pooling_mode::ARGMAX_Y : pooling_mode::ARGMAX_X; + } + else + { + assert(opType == OpType::ReduceSum); + mode = pooling_mode::REDUCE_SUM; + } + Emit(isa::npu_op_pool_t(mode)); + } + else if ( IsDepthwise(opType) ) + { + Emit(isa::npu_op_depthwise_t()); + } + else if ( IsConvolution(opType) || IsVectorProduct(opType) ) + { + // Dynamic weights when op->ifm.size() == 2, _weights_ifm2 parameter should be True + Emit(isa::npu_op_conv_t(op->ifm.size() == 2)); + } + else if ( IsElementwise(opType) ) + { + const auto &item = kElementwiseMap.find(opType); + if ( item == kElementwiseMap.end() ) + { + assert(false && "Unsupported elementwise operator"); + } + else + { + Emit(isa::npu_op_elementwise_t(item->second)); + } + } + else if ( IsDma(opType) ) + { + Emit(isa::npu_op_dma_start_t()); + } + else if ( _arch->UseAvgPoolNop(opType) ) + { + // Implemented using SUM + Emit(isa::npu_op_pool_t(pooling_mode::SUM)); + } + else + { + assert(false && "Unsupported operator"); + } +} + +void EthosU85RCSGenerator::GenerateCommon(const HLCStripe *stripe, bool useGlobalScale, MemoryAccesses &memoryAccesses) +{ + auto op = stripe->operation.get(); + int32_t scalarValue = 0; + bool isScalar = IsScalar(op->ifm[0], scalarValue) && IsElementwise(op->type); + GenerateIFMPrecision(op->ifm[0]); + GenerateIFM(op->type, op->ifm[0], stripe->ifmAreas[0], isScalar, scalarValue); + if ( !isScalar ) + { + memoryAccesses.push_back(ToMemoryAccess(op->ifm[0], stripe->ifmAreas[0], AccessDirection::Read)); + } + ifm_upscale_mode upscaleMode = ToIfmUpscaleMode(op->ifm[0].resamplingMode); + Emit(isa::npu_set_ifm_upscale_t(upscaleMode)); + if ( !IsElementwise(op->type) ) + { + GeneratePadding(stripe->padding); + } + GenerateOFM(op->type, op->ofm, stripe->ofmArea); + memoryAccesses.push_back(ToMemoryAccess(op->ofm, stripe->ofmArea, AccessDirection::Write)); + EthosU85OpConfig *config = static_cast(stripe->operation->config); + GenerateOFMPrecision(op->ofm, useGlobalScale, config->AccOutputEnabled()); + if ( 
!IsElementwise(op->type) && op->type != OpType::Resize ) + { + GenerateKernel(op->kernel, config->Traversal() == EthosU85Traversal::PartKernel); + } + GenerateWeights(stripe, memoryAccesses); + GenerateScales(stripe, memoryAccesses); + GenerateActivation(stripe, memoryAccesses); +} + +// Conv2D/Depthwise operations +void EthosU85RCSGenerator::GenerateConvolutionOp(const HLCStripe *stripe, MemoryAccesses &memoryAccesses) +{ + auto op = stripe->operation.get(); + QuantizedScale ofmScale(1, 0); + bool useGlobalScale = false; + ethosU85Scaling::RescaleConvolution(op); + + if ( op->ifm.size() == 2 ) + { + // Dynamic weights + useGlobalScale = true; + GenerateIFM2(op->type, op->ifm[1], stripe->ifmAreas[1], false, 0); + GenerateIFM2Precision(op->ifm[1]); + Emit(isa::npu_set_weight_format_t(weight_format::SWD, weight_sparsity::NONE)); // Reset weight format + } + + if ( !op->ofm.quantization.scales.empty() ) + { + ofmScale = op->ofm.quantization.scales[0]; + assert(unsigned(ofmScale.shift) < 64); + } + Emit(isa::npu_set_ofm_scale_t(ofmScale.shift, 0, GetOfmRoundingMode(op), ofmScale.scale)); + GenerateCommon(stripe, useGlobalScale, memoryAccesses); +} + +// MaxPool/AvgPool or operations that are mapped to AvgPool +void EthosU85RCSGenerator::GeneratePoolingOp(HLCStripe *stripe, MemoryAccesses &memoryAccesses) +{ + auto op = stripe->operation.get(); + bool useGlobalScale = true; // TODO: Add any per channel scaling modes + if ( _arch->UseAvgPoolNop(op->type) ) + { + assert(op->kernel.Size() == Point2i(1, 1)); + assert(op->kernel.Stride() == Point2i(1, 1)); + assert(op->kernel.Dilation() == Point2i(1, 1)); + assert(op->kernel.DepthMultiplier() == 1); + assert(useGlobalScale); + } + GenerateCommon(stripe, useGlobalScale, memoryAccesses); + if ( useGlobalScale ) + { + GenerateOFMScalingForPooling(op); + } +} + +// Elementwise operations +void EthosU85RCSGenerator::GenerateElementwiseOp(HLCStripe *stripe, MemoryAccesses &memoryAccesses) +{ + auto op = stripe->operation.get(); + 
auto opType = op->type; + constexpr bool useGlobalScale = true; + if ( IsUnaryElementwise(opType) ) + { + assert(op->ifm.size() == 1); + GenerateScalingForElementwise(op); + GenerateCommon(stripe, useGlobalScale, memoryAccesses); + } + else + { + // Binary operation: generate IFM2 registers + assert(op->ifm.size() == 2); + assert(stripe->ifmAreas.size() == 2); + int32_t scalarValue = 0; + auto ifmShape = stripe->ifmAreas[0].SizeShape(); + auto ifm2Shape = stripe->ifmAreas[1].SizeShape(); + GenerateScalingForElementwise(op); + GenerateCommon(stripe, useGlobalScale, memoryAccesses); + bool ifmIsScalar = IsScalar(op->ifm[0], scalarValue); + bool ifm2IsScalar = IsScalar(op->ifm[1], scalarValue); + GenerateIFM2(opType, op->ifm[1], stripe->ifmAreas[1], ifm2IsScalar, scalarValue); + if ( !ifm2IsScalar ) + { + memoryAccesses.push_back(ToMemoryAccess(op->ifm[1], stripe->ifmAreas[1], AccessDirection::Read)); + } + GenerateIFM2Precision(op->ifm[1]); + GenerateInputBroadcast(ifmShape, ifm2Shape, ifmIsScalar, ifm2IsScalar); + } +} + +// Resize operations +void EthosU85RCSGenerator::GenerateResizeOp(HLCStripe *stripe, MemoryAccesses &memoryAccesses) +{ + auto op = stripe->operation.get(); + auto opType = op->type; + constexpr bool useGlobalScale = true; + auto *config = static_cast<EthosU85OpConfig *>(op->config); + Shape ofmBlock = config->_ofmBlock; + + auto ifmShape = stripe->ifmAreas[0].SizeShape(); + + // operator-parameters + const HLCParameters *params = &op->parameters; + const auto &scale_w = params->resize.scaleX; + const auto &scale_h = params->resize.scaleY; + int offset_h = params->resize.offsetY; + int offset_w = params->resize.offsetX; + + round_mode_ofm roundMode = GetOfmRoundingMode(op); + + // scaling is shift only and + 16 + QuantizedScale ofmScale = op->ofm.quantization.scales[0]; + int shift = 16 + ofmScale.shift; + + // X - width + int one_step_int_w = scale_w.d / scale_w.n; + int one_step_mod_w = scale_w.d % scale_w.n; + int blk_step_int_w = ((ofmBlock.Width() - 1) * 
scale_w.d) / scale_w.n; + int blk_step_mod_w = ((ofmBlock.Width() - 1) * scale_w.d) % scale_w.n; + + // Y - height + int one_step_int_h = scale_h.d / scale_h.n; + int one_step_mod_h = scale_h.d % scale_h.n; + int blk_step_int_h = ((ofmBlock.Height() - 1) * scale_h.d) / scale_h.n; + int blk_step_mod_h = ((ofmBlock.Height() - 1) * scale_h.d) % scale_h.n; + + // asserts + assert(shift < (1 << 6)); + assert(ofmScale.scale == 1); + assert(scale_w.n <= 2048); + assert(scale_h.n <= 2048); + assert(-scale_h.n <= offset_h); + assert(offset_h < scale_h.n); + assert(one_step_mod_h < scale_h.n); + assert(-scale_w.n <= offset_w); + assert(offset_w < scale_w.n); + assert(one_step_mod_w < scale_w.n); + assert(ToIfmUpscaleMode(op->ifm[0].resamplingMode) == ifm_upscale_mode::NONE); + assert(ofmBlock.Height() == 1); + assert(ToActivationTranspose(op->ofm.transpose) == activation_transpose::HWC); + + GenerateCommon(stripe, useGlobalScale, memoryAccesses); + + // Resize requires ifm2_zero_point 0 + Emit(isa::npu_set_ifm2_zero_point_t(0)); + Emit(isa::npu_set_ofm_scale_t(16 + ofmScale.shift, 0, roundMode, 1)); + + // Resize specific registers + Emit(isa::npu_set_resize_x_scale_n_m1_t(scale_w.n - 1)); + Emit(isa::npu_set_resize_y_scale_n_m1_t(scale_h.n - 1)); + Emit(isa::npu_set_resize_x_step_t(one_step_int_w, blk_step_int_w, one_step_mod_w, blk_step_mod_w)); + Emit(isa::npu_set_resize_y_step_t(one_step_int_h, blk_step_int_h, one_step_mod_h, blk_step_mod_h)); + Emit(isa::npu_set_resize_x_offset_t(offset_w)); + Emit(isa::npu_set_resize_y_offset_t(offset_h)); + Emit(isa::npu_set_kernel_height_m1_t(ifmShape.Height() - 1)); + Emit(isa::npu_set_kernel_width_m1_t(ifmShape.Width() - 1)); +} + +bool EthosU85RCSGenerator::GenerateStripe(HLCStripe *stripe, MemoryAccesses &memoryAccesses) +{ + auto opType = stripe->operation->type; + EthosU85NpuOp npuOp = ArchEthosU85::GetHWOp(opType); + + if ( npuOp == EthosU85NpuOp::Pooling || npuOp == EthosU85NpuOp::ReduceSum ) + { + GeneratePoolingOp(stripe, 
memoryAccesses); + } + else if ( npuOp == EthosU85NpuOp::Depthwise || npuOp == EthosU85NpuOp::Convolution || npuOp == EthosU85NpuOp::VectorProduct ) + { + GenerateConvolutionOp(stripe, memoryAccesses); + } + else if ( npuOp == EthosU85NpuOp::Elementwise ) + { + GenerateElementwiseOp(stripe, memoryAccesses); + } + else if ( npuOp == EthosU85NpuOp::Resize ) + { + GenerateResizeOp(stripe, memoryAccesses); + } + else + { + LOG_ERROR("Register command stream generator: unsupported operator '{}'\n", OpTypeToString(opType)); + assert(false); + return false; + } + EthosU85OpConfig *config = static_cast<EthosU85OpConfig *>(stripe->operation->config); + GenerateBlockConfig(config, stripe->operation->ofm); + GenerateAccFormat(config); + return true; +} + +// Generates register commands for DMA operations +void EthosU85RCSGenerator::GenerateDMA(const HLCDMA *dma, MemoryAccesses &memoryAccesses) +{ + dma_region_mode srcRegionMode = dma->srcMemArea == _arch->LUTMemory() ? dma_region_mode::INTERNAL : dma_region_mode::EXTERNAL; + dma_region_mode destRegionMode = dma->destMemArea == _arch->LUTMemory() ? dma_region_mode::INTERNAL : dma_region_mode::EXTERNAL; + + uint32_t size0 = dma->sizes.Size() > 0 ? dma->sizes[-1] : 1; + uint32_t size1 = dma->sizes.Size() > 1 ? dma->sizes[-2] : 1; + uint64_t srcStride0 = dma->srcStrides.Size() > 1 ? dma->srcStrides[-2] : 0; + uint64_t srcStride1 = dma->srcStrides.Size() > 2 ? dma->srcStrides[-3] : 0; + uint64_t destStride0 = dma->destStrides.Size() > 1 ? dma->destStrides[-2] : 0; + uint64_t destStride1 = dma->destStrides.Size() > 2 ? dma->destStrides[-3] : 0; + + dma_stride_mode srcStrideMode; + if ( size1 > 1 ) srcStrideMode = dma_stride_mode::D3; + else if ( size0 > 1 ) srcStrideMode = dma_stride_mode::D2; + else srcStrideMode = dma_stride_mode::D1; + + dma_idx_mode srcIndexMode = dma->srcIndexed ? dma_idx_mode::ENABLED : dma_idx_mode::DISABLED; + dma_idx_mode destIndexMode = dma->destIndexed ? 
dma_idx_mode::ENABLED : dma_idx_mode::DISABLED; + assert(!(srcIndexMode == dma_idx_mode::ENABLED && destIndexMode == dma_idx_mode::ENABLED)); + + // Registers for 1D, 2D and 3D mode + Emit(isa::npu_set_dma0_src_region_t(ToRegion(dma->srcMemArea), srcRegionMode, srcStrideMode, srcIndexMode)); + Emit(isa::npu_set_dma0_src_t(dma->srcAddress)); + Emit(isa::npu_set_dma0_dst_region_t(ToRegion(dma->destMemArea), destRegionMode, destIndexMode)); + Emit(isa::npu_set_dma0_dst_t(dma->destAddress)); + assert(dma->length > 0); + Emit(isa::npu_set_dma0_len_t(dma->length)); + + if ( srcStrideMode != dma_stride_mode::D1 ) + { + // Registers for 2D and 3D mode + assert(size0 > 0); + Emit(isa::npu_set_dma0_size0_t(size0)); + } + + if ( srcStrideMode != dma_stride_mode::D1 || dma->srcIndexed ) + { + // Registers for 2D and 3D mode, or src indexed operation + Emit(isa::npu_set_dma0_src_stride0_t(srcStride0)); + } + + if ( srcStrideMode != dma_stride_mode::D1 || dma->destIndexed ) + { + // Registers for 2D and 3D mode, or dest indexed operation + Emit(isa::npu_set_dma0_dst_stride0_t(destStride0)); + } + + if ( srcStrideMode == dma_stride_mode::D3 ) + { + // Registers for 3D mode + assert(size1 > 0); + Emit(isa::npu_set_dma0_size1_t(size1)); + Emit(isa::npu_set_dma0_src_stride1_t(srcStride1)); + Emit(isa::npu_set_dma0_dst_stride1_t(destStride1)); + } + + if ( dma->srcIndexed || dma->destIndexed ) + { + // Registers for indexed operation + Emit(isa::npu_set_dma0_idx_region_t(ToRegion(dma->idxMemArea))); + assert(dma->idxMax > 0); + Emit(isa::npu_set_dma0_idx_max_t(dma->idxMax)); + Emit(isa::npu_set_dma0_idx_t(dma->idxAddress)); + } + + if ( srcStrideMode == dma_stride_mode::D3 && (dma->srcIndexed || dma->destIndexed) ) + { + Emit(isa::npu_set_dma0_idx_skip1_t(dma->idxSkip1)); + } + + if ( srcStrideMode == dma_stride_mode::D1 ) + { + // Address accesses for 1D mode + CheckAddressRange(dma->srcMemArea.memory, dma->srcAddress, dma->length); + CheckAddressRange(dma->destMemArea.memory, 
dma->destAddress, dma->length); + memoryAccesses.emplace_back(AccessDirection::Read, dma->srcMemArea, dma->srcAddress, dma->srcAddress + dma->length); + memoryAccesses.emplace_back(AccessDirection::Write, dma->destMemArea, dma->destAddress, dma->destAddress + dma->length); + } + else + { + // Address accesses for 2D and 3D mode + CheckAddressRange(dma->srcMemArea.memory, dma->srcAddress, dma->srcStrides[0]); + CheckAddressRange(dma->destMemArea.memory, dma->destAddress, dma->destStrides[0]); + memoryAccesses.emplace_back(AccessDirection::Read, dma->srcMemArea, dma->srcAddress, dma->srcAddress + dma->srcStrides[0]); + memoryAccesses.emplace_back(AccessDirection::Write, dma->destMemArea, dma->destAddress, dma->destAddress + dma->destStrides[0]); + } + + if ( dma->srcIndexed || dma->destIndexed ) + { + // Address accesses for indexed operation + CheckAddressRange(dma->idxMemArea.memory, dma->idxAddress, size0 * size1); + memoryAccesses.emplace_back(AccessDirection::Read, dma->idxMemArea, dma->idxAddress, dma->idxAddress + size0 * size1); + } +} + +std::vector EthosU85RCSGenerator::GenerateCommandStream(std::vector> &highLevelCommandStream, + std::vector> *cmdRanges, bool verbose) +{ + _emit.Clear(); + _stripeToLutSlot.clear(); + GenerateInitialRegisterSetup(); + auto cmds = InsertLUTDMACommands(highLevelCommandStream); + std::deque outstandingDmaAccesses; + std::deque outstandingNpuAccesses; + int maxOutstandingDMAOps = _arch->MaxOutstandingDMAOps(); + int maxOutstandingKernelOps = _arch->MaxOutstandingKernelOps(); + HLCStripe *prevOp = nullptr; + std::vector> debugInfo; + for ( auto &hlc : cmds ) + { + MemoryAccesses memoryAccesses; + int emitStart = _emit.Position(); + if ( hlc->IsStripe() ) + { + auto stripe = static_cast(hlc.get()); + if ( verbose ) + { + debugInfo.emplace_back(emitStart, stripe->operation->ToString()); + } + if ( !GenerateStripe(stripe, memoryAccesses) ) + { + return std::vector(); + } + // BLOCKDEP register + int blockdep = CalcBlockDep(prevOp, 
stripe); + Emit(isa::npu_set_blockdep_t(blockdep)); + GenerateWaits(false, memoryAccesses, maxOutstandingKernelOps, outstandingDmaAccesses, outstandingNpuAccesses); + GenerateOperationCode(stripe->operation.get()); + prevOp = stripe; + // Return command mapping information to the caller + int emitEnd = _emit.Position(); + if ( cmdRanges ) + { + cmdRanges->emplace_back(stripe->operation->_srcKey, emitStart, emitEnd); + } + } + else + { + auto dma = static_cast(hlc.get()); + if ( verbose ) + { + debugInfo.emplace_back(emitStart, dma->ToString()); + } + GenerateDMA(dma, memoryAccesses); + GenerateWaits(true, memoryAccesses, maxOutstandingDMAOps, outstandingNpuAccesses, outstandingDmaAccesses); + Emit(isa::npu_op_dma_start_t()); + } + } + Emit(isa::npu_op_stop_t(0xFFFF)); + if ( verbose ) + { + PrintCommandStream(_emit.CommandStream(), debugInfo); + } + return _emit.CommandStream(); +} + +} // namespace regor diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.hpp b/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.hpp new file mode 100644 index 00000000..6c057550 --- /dev/null +++ b/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.hpp @@ -0,0 +1,250 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#pragma once + +#include "architecture/architecture.hpp" +#include "architecture/ethos_u_register_cs_generator.hpp" +#include "ethos_u85.hpp" + +#include +#include +#include +#include + +namespace regor +{ + +class EthosU85Emitter +{ +public: + EthosU85Emitter() = default; + void Emit(uint32_t instr); + void Emit(uint64_t instr); + void Clear(); + int Position() const { return int(_stream.size()); } + const std::vector &CommandStream() const { return _stream; } + +private: + bool SetRegister(uint16_t reg, uint64_t value); + static bool IsCmd0(uint16_t key); + static bool IsCmd1(uint16_t key); + static bool IsOp(uint16_t key); + std::vector _stream; + std::unordered_map _registers; +}; + + +// Specifies the addresses and dimensions of the tiles of a feature map. +// A feature map can use 1 to 4 tiles +struct TileBox +{ + int height0; // The height of tile 0 + int height1; // The height of tile 1 + int width0; // The width of tile 0, and tile 2 (if used) + Address address[4]; // Tile addresses +}; + +enum BasePointerIndex +{ + WeightTensor = 0, // base address index for the Weight tensor + ScratchTensor = 1, // base address index for the Scratch_tensor in the TensorArena + ScratchFastTensor = 2, // base address for the Scratch_fast_tensor + Mem2Mem = 3, // base address slot for memory to memory transfer +}; + +enum class AccessDirection +{ + Read = 0, + Write = 1, +}; + +struct MemoryAccess +{ + AccessDirection direction; + MemArea memArea; + Address start; + Address end; + + MemoryAccess(AccessDirection direction_, MemArea area_, Address start_, Address end_) : + direction(direction_), memArea(area_), start(start_), end(end_) + { + } + + bool Conflicts(const MemoryAccess &other) const + { + bool overlaps = Overlaps(start, end, other.start, other.end) && memArea == other.memArea; + return overlaps && (direction != AccessDirection::Read || other.direction != AccessDirection::Read); + } +}; + +using MemoryAccesses = std::vector; + +struct LutSlot +{ + const 
HLCOperation *hlcOp = nullptr; + int lastUsed = 0; +}; + +/// +/// Generates register command streams for Ethos U85 +/// +class EthosU85RCSGenerator : public EthosURegisterCSGenerator +{ + +public: + EthosU85RCSGenerator(ArchEthosU85 *arch); + + //---------------------------------------------------------------------- + // Print + //---------------------------------------------------------------------- + + int Disassemble(const uint32_t *in, std::string &op, std::vector> &fields); + +protected: + //---------------------------------------------------------------------- + // Helper functions + //---------------------------------------------------------------------- + + void Emit(uint32_t instr); + void Emit(uint64_t instr); + + static int GetDoubleBufferOffset(HLCWeights *weights, int rangeIndex); + static void CheckAddressRange(ArchitectureMemory *memory, Address address, int size); + static void CheckAddresses(const HLCFeatureMap &fm); + // Calculates the rolling buffer address of the given coordinate. + static Address AddressForCoordinate(const HLCFeatureMap &fm, const Shape &strides, const Shape &coord); + // Calculates tile sizes/addresses of a feature map + static TileBox GetTiles(const HLCFeatureMap &fm, const Shape &strides, const Box &area); + MemoryAccess ToMemoryAccess(const HLCFeatureMap &fm, const Box &area, AccessDirection direction); + // Returns region number used in NPU_SET_..._REGION + uint32_t ToRegion(const MemArea &memArea); + static bool UseZeroPoint0(OpType opType, const HLCFeatureMap &fm, bool isOFM); + // Checks if the feature map is a scalar, and if so, returns the + // quantized value in scalarValue. 
+ static bool IsScalar(const HLCFeatureMap &fm, int32_t &scalarValue); + // Calculates waits for KERNEL_WAIT/DMA_WAIT, returns -1 if no wait is needed + // - opAccesses contains the memory accesses for the current operation + // - outstanding contains the memory accesses for ongoing "other" operations + // (DMA operations if the current op is an NPU operation, NPU operations if the current op is a DMA operation) + // Note: NPU - NPU dependency is handled via blockdep + static int CalcCommandWaits(const MemoryAccesses &opAccesses, std::deque &outstanding); + // Returns LUT slot to be used for the given LUT operation. + // Sets alreadyInLutMem to true if the LUT is already in SHRAM. + int AllocateLutSlot(std::vector &lutSlots, const HLCOperation *op, int sizeInSlots, int timestamp, bool &alreadyInLutMem); + //---------------------------------------------------------------------- + // Scaling (OFM/IFM/IFM2_SCALE) + //---------------------------------------------------------------------- + + // Generates OFM_SCALE register for pooling operations + void GenerateOFMScalingForPooling(HLCOperation *poolOp); + // Generates OFM/IFM/IFM2_SCALE registers for elementwise operators. + void GenerateScalingForElementwise(HLCOperation *op); + + + + //---------------------------------------------------------------------- + // BLOCKDEP calculation + //---------------------------------------------------------------------- + + // Given the area and block size, adds the first/last jobs (depending on fromStart) to jobs. 
+ // - area: total amount of work to perform + // - block: size of each job + // - fromStart: if true, the first jobs are added, if false, the last jobs are added + // (in that case, the very last job is added last) + void GetJobs(const Box &area, const Shape &block, int nrJobsToGet, bool fromStart, std::vector &jobs); + // Calculates the value for the BLOCKDEP register + int CalcBlockDep(HLCStripe *prevStripe, HLCStripe *stripe); + + //---------------------------------------------------------------------- + // Register generation + //---------------------------------------------------------------------- + + void GeneratePadding(const HLCPadding &padding); + // Generates ACTIVATION registers + void GenerateActivation(const HLCStripe *stripe, MemoryAccesses &memoryAccesses); + // Generates KERNEL related registers + void GenerateKernel(const Kernel &kernel, bool partKernel); + // Generates IFM_BROADCAST and IFM2_BROADCAST register for binary elementwise operations + void GenerateInputBroadcast(const Shape &ifmShape, const Shape &ifm2Shape, bool ifmIsScalar, bool ifm2IsScalar); + // Generates IFM_PRECISION register + void GenerateIFMPrecision(const HLCFeatureMap &fm); + // Generates IFM2_PRECISION register + void GenerateIFM2Precision(const HLCFeatureMap &fm); + // Generates OFM_PRECISION register + void GenerateOFMPrecision(const HLCFeatureMap &fm, bool useGlobalScale, bool enable_output); + // Generates common IFM registers + void GenerateIFM(OpType opType, const HLCFeatureMap &fm, const Box &inputArea, bool isScalar, int32_t scalarValue); + // Generates common IFM2 registers + void GenerateIFM2(OpType opType, const HLCFeatureMap &fm, const Box &inputArea, bool isScalar, int32_t scalarValue); + // Generates OFM registers + void GenerateOFM(OpType opType, const HLCFeatureMap &fm, const Box &outputArea); + // Generates WEIGHT registers + void GenerateWeights(const HLCStripe *stripe, MemoryAccesses &memoryAccesses); + // Generates SCALE registers + void 
GenerateScales(const HLCStripe *stripe, MemoryAccesses &memoryAccesses); + // Generates OFM_BLK_HEIGHT/WIDTH/DEPTH registers + void GenerateBlockConfig(const EthosU85OpConfig *config, const HLCFeatureMap &fm); + // Generates ACC_FORMAT register + void GenerateAccFormat(const EthosU85OpConfig *config); + // Calculates and generates KERNEL_WAIT or DMA_WAIT register + void GenerateWaits(bool isKernelWait, const MemoryAccesses &memoryAccesses, int maxWaits, + std::deque &outstandingAccesses, std::deque &accessesToUpdate); + // Inserts DMA commands for copying LUTs from constant memory + // to LUT memory + std::vector> InsertLUTDMACommands(std::vector> &cmds); + + //---------------------------------------------------------------------- + // Operations + //---------------------------------------------------------------------- + + // Generates NPU_OP_* command + void GenerateOperationCode(const HLCOperation *op); + void GenerateCommon(const HLCStripe *stripe, bool useGlobalScale, MemoryAccesses &memoryAccesses); + // Conv2D/Depthwise operations + void GenerateConvolutionOp(const HLCStripe *stripe, MemoryAccesses &memoryAccesses); + // MaxPool/AvgPool or operations that are mapped to AvgPool + void GeneratePoolingOp(HLCStripe *stripe, MemoryAccesses &memoryAccesses); + // Elementwise operations + void GenerateElementwiseOp(HLCStripe *stripe, MemoryAccesses &memoryAccesses); + // Resize operations + void GenerateResizeOp(HLCStripe *stripe, MemoryAccesses &memoryAccesses); + bool GenerateStripe(HLCStripe *stripe, MemoryAccesses &memoryAccesses); + // Generates register commands for DMA operations + void GenerateDMA(const HLCDMA *dma, MemoryAccesses &memoryAccesses); + + virtual void GenerateInitialRegisterSetup() + { + // No special initial setup for Ethos U85 + } + +public: + std::vector GenerateCommandStream(std::vector> &highLevelCommandStream, + std::vector> *cmdRanges, bool verbose) override; + static uint32_t ConfigRegister(int macs, int cmdStreamVersion, int 
numAxiSram, int numAxiExt, int numWd, int product); + static bool IsSupportedElementwise(const OpType opType); + static uint32_t IdRegister(); + +private: + ArchEthosU85 *_arch; + // For stripes that use LUT: the LUT slot to be used + std::unordered_map _stripeToLutSlot; + EthosU85Emitter _emit; +}; + +} // namespace regor diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_scaling.cpp b/ethosu/regor/architecture/ethosu85/ethos_u85_scaling.cpp new file mode 100644 index 00000000..0056dc8d --- /dev/null +++ b/ethosu/regor/architecture/ethosu85/ethos_u85_scaling.cpp @@ -0,0 +1,327 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "ethos_u85_scaling.hpp" + +#include "common/logging.hpp" + +#include "architecture/ethos_u_scaling.hpp" +#include "compiler/high_level_command_stream.hpp" +#include "compiler/op_type.hpp" +#include "compiler/quantization.hpp" + +namespace regor::ethosU85Scaling +{ +namespace +{ +void AdvancedElementwiseAddSubScale(double input1Scale, double input2Scale, double outputScale, int bitDepth, + QuantizedScale &input1Rescale, QuantizedScale &input2Rescale, QuantizedScale &outScale) +{ + int inputShift = bitDepth == 8 ? 
20 : 15; + double ifm1Rescale; + double ifm2Rescale; + SimplifiedElementwiseAddSubScale(input1Scale, input2Scale, outputScale, inputShift, ifm1Rescale, ifm2Rescale, outScale); + input1Rescale = QuantizedScale(ifm1Rescale); + input2Rescale = QuantizedScale(ifm2Rescale); +} + +float GetScale(const Quantization *quant) +{ + if ( quant != nullptr && quant->scales.size() != 0 ) + { + // Use single precision to match reference + return float(quant->scales[0].Dequantize()); + } + else + { + return 1.0f; + } +} + +} // namespace + +void RescaleConvolution(HLCOperation *op) +{ + int ifmCnt = int(op->ifm.size()); + Quantization *ifm1Quant = &op->ifm[0].quantization; + Quantization *ifm2Quant = ifmCnt == 2 ? &op->ifm[1].quantization : nullptr; + Quantization *ofmQuant = &op->ofm.quantization; + + if ( ofmQuant->type == QuantizationType::EXPLICIT ) + { + return; + } + + QuantizedScale outScale(1, 0); + + double ifm1Scale = GetScale(ifm1Quant); + double ifm2Scale = GetScale(ifm2Quant); + double ofmScale = GetScale(ofmQuant); + + DataType ifmDataType = op->ifm[0].dataType; + OpType opType = op->type; + + bool allHaveScale = + (!ifm1Quant->scales.empty() && !ofmQuant->scales.empty() && ifm2Quant != nullptr && !ifm2Quant->scales.empty()); + + bool reducedScale = DataTypeSizeBits(ifmDataType) != 8; + + // If ifmCnt is 2 then it is a convolution with dynamic weights and global scale is used + if ( ifmCnt == 2 && allHaveScale ) + { + if ( reducedScale ) + { + outScale = QuantizedScale((ifm1Scale * ifm2Scale) / ofmScale, true); + } + else + { + outScale = ElementwiseMulScale(ifm1Scale, ifm2Scale, ofmScale); + } + } + if ( ofmQuant != nullptr && ofmQuant->type == QuantizationType::TFLITE ) + { + ofmQuant->scales.clear(); + ofmQuant->scales.push_back(outScale); + ofmQuant->type = QuantizationType::EXPLICIT; + } +} + +void RescaleElementwise(HLCOperation *op) +{ + int ifmCnt = int(op->ifm.size()); + Quantization *ifm1Quant = &op->ifm[0].quantization; + Quantization *ifm2Quant = ifmCnt == 
2 ? &op->ifm[1].quantization : nullptr; + Quantization *ofmQuant = &op->ofm.quantization; + + if ( ifm1Quant->type == QuantizationType::EXPLICIT && ofmQuant->type == QuantizationType::EXPLICIT && + (ifm2Quant == nullptr || ifm2Quant->type == QuantizationType::EXPLICIT) ) + { + return; + } + + QuantizedScale input1Scale(1, 0); + QuantizedScale input2Scale(1, 0); + QuantizedScale outScale(1, 0); + + double ifm1Scale = GetScale(ifm1Quant); + double ifm2Scale = GetScale(ifm2Quant); + double ofmScale = GetScale(ofmQuant); + + DataType ifmDataType = op->ifm[0].dataType; + OpType opType = op->type; + + bool allHaveScale = + (!ifm1Quant->scales.empty() && !ofmQuant->scales.empty() && ifm2Quant != nullptr && !ifm2Quant->scales.empty()); + + int bitDepth = DataTypeSizeBits(ifmDataType); + if ( opType == OpType::Div ) + { + // Div scales should always be Unit + } + else if ( opType == OpType::Mul ) + { + if ( allHaveScale ) + { + outScale = ElementwiseMulScale(ifm1Scale, ifm2Scale, ofmScale); + } + } + else if ( opType == OpType::Abs || opType == OpType::LeakyRelu ) + { + if ( opType == OpType::LeakyRelu ) + { + input1Scale = QuantizedScale(ifm1Scale / ofmScale); + } + } + else if ( opType == OpType::Add || opType == OpType::Sub ) + { + if ( allHaveScale ) + { + AdvancedElementwiseAddSubScale(ifm1Scale, ifm2Scale, ofmScale, bitDepth, input1Scale, input2Scale, outScale); + } + } + + if ( ifm1Quant != nullptr && ifm1Quant->type == QuantizationType::TFLITE ) + { + ifm1Quant->scales.clear(); + ifm1Quant->scales.push_back(input1Scale); + ifm1Quant->type = QuantizationType::EXPLICIT; + } + if ( ifm2Quant != nullptr && ifm2Quant->type == QuantizationType::TFLITE ) + { + ifm2Quant->scales.clear(); + ifm2Quant->scales.push_back(input2Scale); + ifm2Quant->type = QuantizationType::EXPLICIT; + } + if ( ofmQuant != nullptr && ofmQuant->type == QuantizationType::TFLITE ) + { + ofmQuant->scales.clear(); + ofmQuant->scales.push_back(outScale); + ofmQuant->type = QuantizationType::EXPLICIT; + 
} +} + +void RescalePooling(HLCOperation *op, bool isNoOp) +{ + Quantization *ifm1Quant = &op->ifm[0].quantization; + Quantization *ofmQuant = &op->ofm.quantization; + uint32_t scale = 1; + int shift = 0; + DataType ifmDataType = op->ifm[0].dataType; + OpType opType = op->type; + + if ( ofmQuant->type != QuantizationType::TFLITE ) + { + // Explicit scaling + return; + } + + if ( opType == OpType::MaxPool || opType == OpType::ArgMax ) + { + // Do nothing + } + else if ( !ifm1Quant->scales.empty() && !ofmQuant->scales.empty() ) + { + double ifmScale = GetScale(ifm1Quant); + double ofmScale = GetScale(ofmQuant); + if ( opType == OpType::Sigmoid || opType == OpType::Tanh ) + { + assert(ifmDataType == DataType::Int16); + double rescale = 0x3000 * ifmScale; + // Calculate scale and shift for the output scale of 1/(3*4096) + double xLog2 = std::log2(ifmScale); + int roundedLog2 = int(std::round(xLog2)); + bool isPowerOf2 = std::abs(xLog2 - roundedLog2) < 0.001; + shift = roundedLog2 + 12; + if ( isPowerOf2 && ((opType == OpType::Tanh && (shift == 0 || shift == 1)) || (opType == OpType::Sigmoid && shift == 0)) ) + { + // Special handling if input scale is 1/2048 or 1/4096 + scale = 3 << shift; + shift = 0; + } + else + { + shift = 0; + int maxRescale = 16384; + while ( rescale < maxRescale && shift <= 30 ) + { + shift++; + rescale *= 2; + } + scale = uint32_t(rescale); + } + } + else if ( opType == OpType::MemoryCopy ) + { + double rescale = ifmScale / ofmScale; + // In the case of concat or other memory operation, rescaling might be needed. 
+ // The scale is maximised, to get maximum precision + QuantizePoolingScaleMaxPrecision(op->kernel.ElementsWH(), rescale, scale, shift, 31); + } + else if ( opType == OpType::Quantize ) + { + // Quantize operations need double-precision scaling + QuantizedScale quantScale(ifmScale / ofmScale); + scale = uint32_t(quantScale.scale); + shift = quantScale.shift; + } + else if ( isNoOp ) + { + QuantizedScale quantScale(float(ifmScale) / float(ofmScale)); + scale = uint32_t(quantScale.scale); + shift = quantScale.shift; + } + else + { + // Normal pooling operation, without need for special scaling + double rescale = ifmScale / ofmScale; + QuantizePoolingScale(op->kernel.ElementsWH(), rescale, 0, scale, shift, 31); + } + } + ofmQuant->scales.clear(); + ofmQuant->scales.push_back({int32_t(scale), shift}); + ofmQuant->type = QuantizationType::EXPLICIT; +} + +Quantization RescalePerChannel(const Quantization &ifmQuant, const Quantization &weightQuant, + const Quantization &ofmQuant, const DataType scaleDataType, const DataType ifmDataType) +{ + if ( ofmQuant.type != QuantizationType::TFLITE ) + { + // Explicit quantized scale has already been set + return ofmQuant; + } + + Quantization quantResult; + quantResult.type = QuantizationType::EXPLICIT; + quantResult.zeroPoints = ofmQuant.zeroPoints; + quantResult.quantMin = ofmQuant.quantMin; + quantResult.quantMax = ofmQuant.quantMax; + quantResult.dimension = ofmQuant.dimension; + quantResult.forceZeroPoint = ofmQuant.forceZeroPoint; + + if ( !ifmQuant.scales.empty() && !ofmQuant.scales.empty() && !weightQuant.scales.empty() ) + { + DataType dataType = DataType::None; + bool reducedScale = false; + if ( scaleDataType == DataType::Int32 ) + { + switch ( ifmDataType ) + { + case DataType::Int8: + case DataType::UInt8: + case DataType::Int16: + dataType = ifmDataType; + break; + default: + break; + } + } + else if ( scaleDataType == DataType::Int64 && DataTypeSizeBits(ifmDataType) == 16 ) + { + dataType = DataType::Int16; + 
reducedScale = true; + } + + int modIfm = (ifmQuant.scales.size()) == 1 ? 0 : -1; + int modOfm = (ofmQuant.scales.size()) == 1 ? 0 : -1; + + quantResult.scales.reserve(weightQuant.scales.size()); + + for ( int i = 0; i < int(weightQuant.scales.size()); i++ ) + { + double v = 1.0; + float ifmScale = float(ifmQuant.scales[i & modIfm].Dequantize()); + float ofmScale = float(ofmQuant.scales[i & modOfm].Dequantize()); + float weightScale = float(weightQuant.scales[i].Dequantize()); + if ( dataType == DataType::UInt8 ) + { + v = double(ifmScale * weightScale) / double(ofmScale); + } + else if ( dataType == DataType::Int8 || dataType == DataType::Int16 ) + { + v = (double(ifmScale) * double(weightScale)) / double(ofmScale); + } + + quantResult.scales.emplace_back(v, reducedScale); + } + } + + return quantResult; +} + +} // namespace regor::ethosU85Scaling diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_scaling.hpp b/ethosu/regor/architecture/ethosu85/ethos_u85_scaling.hpp new file mode 100644 index 00000000..aa8bed53 --- /dev/null +++ b/ethosu/regor/architecture/ethosu85/ethos_u85_scaling.hpp @@ -0,0 +1,38 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
#pragma once
#include "common/data_type.hpp"

namespace regor
{

// Forward declarations only; the scaling helpers take these by pointer/reference.
class Quantization;
struct HLCOperation;

// Quantized-scale derivation for Ethos-U85. The Rescale* helpers convert the
// TFLITE-style floating-point scales carried on a high-level-compiler (HLC)
// operation into the explicit integer (scale, shift) pairs consumed by the
// hardware.
namespace ethosU85Scaling
{

// Derive the OFM scale/shift for pooling-type ops. isNoOp selects a separate
// single-precision rescale path (see the implementation's isNoOp branch).
void RescalePooling(HLCOperation *op, bool isNoOp);
// Derive the OFM scale/shift for convolution ops (implementation not shown here).
void RescaleConvolution(HLCOperation *op);
// Derive the OFM scale/shift for elementwise ops (implementation not shown here).
void RescaleElementwise(HLCOperation *op);
// Compute per-channel explicit quantization from IFM/weight/OFM scales.
// Returns ofmQuant unchanged when its type is already non-TFLITE (explicit).
Quantization RescalePerChannel(const Quantization &ifmQuant, const Quantization &weightQuant,
    const Quantization &ofmQuant, const DataType scaleDataType, const DataType ifmDataType);

}  // namespace ethosU85Scaling
}  // namespace regor
+// + +#include "ethos_u85_weight_encoder.hpp" + +#include "common/logging.hpp" + +#include "architecture/architecture.hpp" +#include "architecture/ethos_u_scaling.hpp" +#include "architecture/mlw_encode.hpp" +#include "common/buffer_view.hpp" +#include "common/shape.hpp" +#include "compiler/tensor_properties.hpp" +#include "ethos_u85.hpp" +#include "ethos_u85_scaling.hpp" + +#include +#include +#include + + +namespace regor +{ + + +EthosU85WeightEncoder::EthosUEncodingConfig::EthosUEncodingConfig(int cores, Flags weightFormat) : + _cores(cores), _weightFormat(weightFormat) +{ +} + +void EthosU85WeightEncoder::EthosUEncodingConfig::Rehash() +{ + _hash = SimpleHash32(ofmBlockDepth, traversal, _depthOffsetHash, ifmType, dilation, ohwiStrides); + + _depthOffsetHash = 0; + for ( int offset : this->depthOffsets ) + { + _depthOffsetHash = _depthOffsetHash * 31 ^ offset; + } +} + +uint32_t EthosU85WeightEncoder::EthosUEncodingConfig::Hash() +{ + return _hash; +} + +bool EthosU85WeightEncoder::EthosUEncodingConfig::Equals(IWeightEncodingConfig *other) +{ + EthosUEncodingConfig *p = static_cast(other); + return std::tie(ofmBlockDepth, ifmBlockDepth, traversal, _depthOffsetHash, ifmType, dilation, ohwiStrides, _weightFormat) == + std::tie(p->ofmBlockDepth, p->ifmBlockDepth, p->traversal, p->_depthOffsetHash, p->ifmType, p->dilation, p->ohwiStrides, p->_weightFormat); +} + +const std::vector &EthosU85WeightEncoder::EthosUEncodingConfig::DepthOffsets() +{ + return this->depthOffsets; +} + +Flags EthosU85WeightEncoder::EthosUEncodingConfig::Format() +{ + return _weightFormat; +} + + +std::unique_ptr EthosU85WeightEncoder::GetEncodingConfig(ArchitectureOpConfig *opCfg, const WeightsRef &weights, + const Kernel *kernel, DataType ifmType, const std::vector &depthOffsets, Flags format) +{ + std::unique_ptr params = std::make_unique(_arch->_cores, format); + + EthosU85OpConfig *opConfig = static_cast(opCfg); + params->ofmUBlock = opConfig->OfmUBlock(); + params->ofmBlockDepth = 
// Pack one (bias, scale, shift) triple into the 10-byte Acc32 scale-stream
// record:
//   bytes 0-3 : bias  (signed 32-bit, little-endian)
//   bytes 4-7 : scale (unsigned 31-bit, little-endian, top bit forced clear)
//   byte  8   : shift (unsigned 6-bit)
//   byte  9   : zero padding
// Returns the number of bytes written (always 10).
static int EncodeBias32(int64_t bias, int32_t scale, int shift, uint8_t data[10])
{
    assert(bias >= -(1LL << 31) && bias < (1LL << 31));  // must fit signed 32 bits
    assert(scale >= 0);                                  // unsigned 31-bit range
    assert(shift >= 0 && shift < (1 << 6));              // unsigned 6-bit range

    // Bias: four little-endian bytes.
    for ( int i = 0; i < 4; i++ )
    {
        data[i] = uint8_t((bias >> (i * 8)) & 0xFF);
    }
    // Scale: three full bytes plus a 7-bit top byte (bit 31 never encoded).
    for ( int i = 0; i < 3; i++ )
    {
        data[4 + i] = uint8_t((scale >> (i * 8)) & 0xFF);
    }
    data[7] = uint8_t((scale >> 24) & 0x7F);
    data[8] = uint8_t(shift & 0x3F);
    data[9] = 0;
    return 10;
}
// Validates that a weight stream satisfies 2:4 structured sparsity while it
// is being fed to the encoder: within every aligned group of 4 consecutive
// weights at the same kernel position, at least 2 must be zero, and every
// weight must fit in [-127, 127]. Any violation throws WeightsNotSparse.
struct SparsityTracker
{
    int _sparse_zeroes = 4;           // zeros seen in the current group of 4
    int _sparse_index = 0;            // weight index within the current position
    uint32_t _sparse_pos = 0xFFFFFFFF;  // packed (y << 16 | x) kernel position
    // Forget the current position so the next Check() starts a fresh run.
    void Reset() { _sparse_pos = 0xFFFFFFFF; }

    void Check(uint32_t pos, int depth, int weight)
    {
        if ( _sparse_pos != pos )
        {
            // New kernel position: restart counting. Sparse groups must start
            // on a multiple-of-4 depth boundary.
            _sparse_pos = pos;
            _sparse_zeroes = 0;
            _sparse_index = 0;
            if ( depth & 3 ) throw WeightsNotSparse();
        }

        if ( weight == 0 ) _sparse_zeroes++;
        // Sparse weights must be representable in 8 bits excluding -128.
        else if ( weight > 127 || weight < -127 ) throw WeightsNotSparse();

        // End of a group of 4: require the 2-zeros-per-4 invariant.
        if ( (_sparse_index & 3) == 3 )
        {
            if ( _sparse_zeroes < 2 ) throw WeightsNotSparse();
            _sparse_zeroes = 0;
        }

        _sparse_index++;
    }
};
ofmBlockDepth, int ifmBlockDepth, int ifmBitDepth, + int ofmUBlockDepth, WeightTransformFunc func, WeightTransformParam *param, EthosU85Traversal traversal, bool sparse) + { + const bool ifm16bit = (ifmBitDepth == 16); + _streams = cores; + _transform = func; + _param = param; + _traversal = traversal; + _sparse = sparse; + _stride = stride; + + _ofmBlockDepth = ofmBlockDepth; + _ifmBlockDepth = ifmBlockDepth; + _ofmUBlockDepth = short(ofmUBlockDepth); + + if ( traversal == EthosU85Traversal::PartKernel ) + { + _subKernelRound = (ifm16bit || sparse) ? 10 : 5; + _ifmUBlockDepth = ifm16bit && !sparse ? 8 : _ifmBlockDepth; + } + else + { + if ( traversal == EthosU85Traversal::DepthFirst ) + { + _stride = Point2i(1, 1); + _subKernelRound = 1; + _ifmUBlockDepth = _ifmBlockDepth; + } + else if ( traversal == EthosU85Traversal::Depthwise ) + { + _subKernelRound = 10; + _ifmUBlockDepth = 1; + } + } + + _decompX = short(8 / dilation.x); + _decompY = short(8 / dilation.y); + _dwPaddingCount = (!ifm16bit && macs <= 256) ? 0 : (macs <= 512) ? 
    // Produce up to `count` reordered weights into `output`, dispatching to
    // the traversal-specialised generator. Returns the number of weights
    // written; fewer than `count` signals end-of-stream.
    // NOTE(review): the GetNext calls below appear to have lost their
    // template argument lists in extraction — each branch presumably selects
    // a distinct <IS_DEPTHWISE, IS_SPARSE> specialisation (depthwise,
    // sparse, and the default dense case); confirm against upstream source.
    int Get(int16_t *output, int count) override
    {
        if ( _traversal == EthosU85Traversal::Depthwise )
        {
            // Depthwise traversal and 2:4 sparsity are mutually exclusive.
            assert(!_sparse);
            return GetNext(output, count);
        }
        else if ( _sparse )
        {
            return GetNext(output, count);
        }

        return GetNext(output, count);
    }
1 : clippedIfmBlockDepth; + for ( ifmUBlockOuter = _ifmUBlockOuter; ifmUBlockOuter < ifmBlockDepthOuter; ifmUBlockOuter += _ifmUBlockDepth ) + { + // OFM uBlocks in OFM-block over depth + for ( ofmUBlockOuter = _ofmUBlockOuter; ofmUBlockOuter < clippedOfmBlockDepth; ofmUBlockOuter += _ofmUBlockDepth ) + { + // Part kernel first works across the kernel H/W and needs padding + if ( !_subKernelElements ) + { + int subKernelElements = subWidth * subHeight; + _subKernelElements = RoundAway(subKernelElements, _subKernelRound); + } + for ( strideY = _strideY; strideY < _stride.y; ++strideY ) + { + int stridedKernelH = (subHeight + _stride.y - 1 - strideY) / _stride.y; + for ( strideX = _strideX; strideX < _stride.x; ++strideX ) + { + int stridedKernelW = (subWidth + _stride.x - 1 - strideX) / _stride.x; + for ( kernelY = _kernelY; kernelY < stridedKernelH; ++kernelY ) + { + int y = kernelY; + for ( kernelX = _kernelX; kernelX < stridedKernelW; ++kernelX ) + { + int x = kernelY % 2 == 0 ? kernelX : stridedKernelW - 1 - kernelX; + _subKernelElements--; + int ifmUBlockInnerStep = IS_DEPTHWISE ? 1 : (IS_SPARSE ? 
16 : 8); + for ( ifmUBlockInner = _ifmUBlockInner; ifmUBlockInner < _ifmUBlockDepth; ifmUBlockInner += ifmUBlockInnerStep ) + { + // Feed OFM uBlock elements + for ( ofmUBlockZ = _ofmUBlockZ; ofmUBlockZ < _ofmUBlockDepth; ofmUBlockZ += InterleaveDepth * _streams ) + { + for ( ofmUBlockInner = _ofmUBlockInner; ofmUBlockInner < InterleaveDepth; ofmUBlockInner++ ) + { + // Source IFM uBlock elements (only 1 element deep if + // depthwise) + for ( ifmUBlockZ = _ifmUBlockZ; ifmUBlockZ < ifmUBlockInnerStep; ifmUBlockZ++ ) + { + // Source position within the current subkernel + int wx = subKernelX + strideX + x * _stride.x; + int wy = subKernelY + strideY + y * _stride.y; + // Source IFM/OFM slices + int ifm_z = ifmBlockZ + ifmUBlockOuter + ifmUBlockInner + ifmUBlockZ; + int ofm_z = ofmBlockZ + ofmUBlockOuter + ofmUBlockInner + ofmUBlockZ; + int weight = 0; + if ( ifm_z < _ifmDepth && ofm_z < _ofmDepth ) + { + _param->o = ofm_z; + _param->h = wy; + _param->w = wx; + _param->i = ifm_z; + weight = int(buffer[WeightIndex(ofm_z, wy, wx, ifm_z)]); + weight = _transform(_param, weight); + } + + if constexpr ( IS_SPARSE ) + _sparsity.Check((unsigned(wy) << 16) | wx, ifm_z, weight); + + *write++ = int16_t(weight); + + if ( --count == 0 ) + { + // Save state + _subKernelElements++; + _ifmUBlockZ = ifmUBlockZ + 1; + _ofmUBlockInner = ofmUBlockInner; + _ofmUBlockZ = ofmUBlockZ; + _ifmUBlockInner = ifmUBlockInner; + _kernelX = kernelX; + _kernelY = kernelY; + _strideX = strideX; + _strideY = strideY; + _ofmUBlockOuter = ofmUBlockOuter; + _ifmUBlockOuter = ifmUBlockOuter; + _subKernelY = subKernelY; + _subKernelX = subKernelX; + _ofmBlockZ = ofmBlockZ; + _ifmLoopInc = -_ifmLoopInc; + return int(intptr_t(write - output)); + } + } + _ifmUBlockZ = 0; + } + _ofmUBlockInner = 0; + } + _ofmUBlockZ = _streamIndex * InterleaveDepth; + } + // Depthwise padding + if ( IS_DEPTHWISE && _subKernelElements % _subKernelRound == 0 ) + { + int padCount = _dwPaddingCount * _ofmUBlockDepth / _streams; + 
for ( padding = _padding; padding < padCount; padding++ ) + { + *write++ = 0; + if ( --count == 0 ) + { + // Save state + _subKernelElements++; + _padding = padding + 1; + _ifmUBlockInner = ifmUBlockInner; // Will skip loop above + _kernelX = kernelX; + _kernelY = kernelY; + _strideX = strideX; + _strideY = strideY; + _ofmUBlockOuter = ofmUBlockOuter; + _ifmUBlockOuter = ifmUBlockOuter; + _subKernelY = subKernelY; + _subKernelX = subKernelX; + _ofmBlockZ = ofmBlockZ; + _ifmLoopInc = -_ifmLoopInc; + return int(intptr_t(write - output)); + } + } + _padding = 0; + } + _ifmUBlockInner = 0; + } + _kernelX = 0; + } + _kernelY = 0; + } + _strideX = 0; + } + // Padding + if ( _subKernelElements > 0 ) + { + int padCount = _subKernelElements + (IS_DEPTHWISE ? _dwPaddingCount : 0); + padCount = padCount * _ifmUBlockDepth * _ofmUBlockDepth / _streams; + for ( padding = _padding; padding < padCount; padding++ ) + { + *write++ = 0; + if ( --count == 0 ) + { + // Save state + _padding = padding + 1; + _strideY = strideY; // Will skip loop above + _ofmUBlockOuter = ofmUBlockOuter; + _ifmUBlockOuter = ifmUBlockOuter; + _subKernelY = subKernelY; + _subKernelX = subKernelX; + _ofmBlockZ = ofmBlockZ; + _ifmLoopInc = -_ifmLoopInc; + return int(intptr_t(write - output)); + } + } + _padding = 0; + } + _subKernelElements = 0; + _strideY = 0; + } + _ofmUBlockOuter = 0; + } + _ifmUBlockOuter = 0; + } + _subKernelY = 0; + } + _subKernelX = 0; + } + } + _ifmLoopInc = -_ifmBlockDepth; + _ifmBlockZ = 0; + _ofmBlockZ = 0; + // Return weights generated (less than requested count == EOS) + return int(intptr_t(write - output)); + } +}; + + +template +class EthosU85WeightOrderingFwd : public WeightSourceCommon +{ +protected: + int _weightBlockSize; + int _blockSizeEmitted; + EthosU85Traversal _traversal; + std::vector>> _weightSource; + +public: + EthosU85WeightOrderingFwd(int cores, int macs, Point2i stride, const Point2i &dilation, int ofmBlockDepth, int ifmBlockDepth, + int ifmBitDepth, int 
    // Register the raw OHWI weight buffer for this depth slice, then fan the
    // same buffer out to every per-stream ordering object so their outputs
    // can later be interleaved block-by-block in Get().
    void SetSource(const void *buffer, int depthOffset, const Shape &ohwiShape, const Shape &ohwiStrides, int streamIndex) override
    {
        SetSourceCommon(buffer, depthOffset, ohwiShape, ohwiStrides, streamIndex, false);
        // Interleaving restarts from the caller-selected stream.
        _streamIndex = streamIndex;
        for ( int stream = 0; stream < _streams; ++stream )
        {
            _weightSource[stream]->SetSource(buffer, depthOffset, ohwiShape, ohwiStrides, stream);
        }
    }
_blockSizeEmitted = 0; + } + _streamIndex = 0; + } + // Return weights generated (less than requested count == EOS) + return offset; + } +}; + + +std::unique_ptr EthosU85WeightEncoder::GetWeightSource( + IWeightEncodingConfig *config, DataType weightType, WeightTransformFunc func, WeightTransformParam *param) +{ + EthosUEncodingConfig *cfg = static_cast(config); + + int ofmUBlockDepth = cfg->ofmUBlock.Depth(); + + int ifmBitDepth = DataTypeSizeBits(cfg->ifmType); + bool isFast = cfg->Format() & WeightFormat::Fast; + bool isSparse = cfg->Format() & WeightFormat::Sparse2_4; + + if ( weightType == DataType::UInt8 ) + { + if ( isFast && _arch->_cores > 1 ) // No interleaving needed for FWD if only one stream + { + return std::make_unique>(_arch->_cores, _arch->_macs, cfg->stride, cfg->dilation, + cfg->ofmBlockDepth, cfg->ifmBlockDepth, ifmBitDepth, ofmUBlockDepth, func, param, cfg->traversal, isSparse); + } + else + { + assert(!(isFast && cfg->traversal == EthosU85Traversal::Depthwise)); + return std::make_unique>(_arch->_cores, _arch->_macs, cfg->stride, cfg->dilation, + cfg->ofmBlockDepth, cfg->ifmBlockDepth, ifmBitDepth, ofmUBlockDepth, func, param, cfg->traversal, isSparse); + } + } + else if ( weightType == DataType::Int8 ) + { + if ( isFast && _arch->_cores > 1 ) // No interleaving needed for FWD if only one stream + { + return std::make_unique>(_arch->_cores, _arch->_macs, cfg->stride, cfg->dilation, + cfg->ofmBlockDepth, cfg->ifmBlockDepth, ifmBitDepth, ofmUBlockDepth, func, param, cfg->traversal, isSparse); + } + else + { + assert(!(isFast && cfg->traversal == EthosU85Traversal::Depthwise)); + return std::make_unique>(_arch->_cores, _arch->_macs, cfg->stride, cfg->dilation, + cfg->ofmBlockDepth, cfg->ifmBlockDepth, ifmBitDepth, ofmUBlockDepth, func, param, cfg->traversal, isSparse); + } + } + + assert(false && "No weight source for this datatype"); + return nullptr; +} + + +template +class EthosU85ScaleSource : public IVolumeScaleSource +{ +private: + const 
    // Emit up to `count` (bias, quantized-scale) pairs into the two output
    // buffers; returns the number emitted. Indices past the real bias buffer
    // produce zero padding (bias 0, scale (0,0)) so the stream can be rounded
    // up to the micro-block multiple computed in SetSource(). Scales repeat
    // cyclically, covering both per-tensor (single scale) and per-channel use.
    int Get(int64_t *biasBuffer, QuantizedScale *quantBuffer, int count)
    {
        count = std::min(count, _biasCount);
        const size_t scaleSize = _quantization.scales.size();

        for ( int i = 0; i < count; i++ )
        {
            int index = _biasIndex + i;
            if ( index < _bufferSize )
            {
                *biasBuffer++ = _buffer[index];
                *quantBuffer++ = _quantization.scales[index % scaleSize];
            }
            else
            {
                // Past the end of the real biases: pad with neutral values.
                *biasBuffer++ = 0;
                *quantBuffer++ = QuantizedScale(0, 0);
            }
            _biasCount--;
        }

        _biasIndex += count;
        return count;
    }
// Convert TFLITE-style IFM/weight/OFM quantization into the explicit
// per-channel (scale, shift) form used by the hardware scale stream.
Quantization EthosU85WeightEncoder::MakeExplicit(const Quantization &ifmQ, const Quantization &weightQ,
    const Quantization &ofmQ, DataType scaleType, DataType ifmType)
{
    // All 16-bit IFM types are rescaled as signed Int16.
    if ( DataTypeSizeBits(ifmType) == 16 ) ifmType = DataType::Int16;

    return ethosU85Scaling::RescalePerChannel(ifmQ, weightQ, ofmQ, scaleType, ifmType);
}
// Encode the bias/scale stream: each element packs to a fixed 10-byte record
// via EncodeBias32 or EncodeBias48, selected by the accumulator width of the
// op config. Appends to `result` and returns the number of bytes written.
// When measureOnly is set, returns the exact final size without encoding.
// NOTE(review): the parameter type `std::vector &result` and the bare
// `static_cast(config)` below appear to have lost their template argument
// lists in extraction (presumably std::vector<uint8_t> and
// static_cast<EthosUEncodingConfig *>); confirm against upstream source.
int EthosU85WeightEncoder::EncodeScales(IWeightEncodingConfig *config, IScaleSource *source, std::vector &result, bool measureOnly)
{
    EthosUEncodingConfig *cfg = static_cast(config);

    constexpr int BUFFER_SIZE = 8;
    constexpr int SCALE_ELEMENT_SIZE = 10;  // bytes per encoded bias/scale record
    // Acc32 packs a 32-bit bias + 31-bit scale; Acc48 a 48-bit bias + 15-bit scale.
    auto EncodeBias = cfg->acc == EthosU85Accumulator::Acc32 ? EncodeBias32 : EncodeBias48;

    if ( measureOnly )
    {
        return source->Elements() * SCALE_ELEMENT_SIZE; // Must be accurate
    }

    int64_t scaleBuffer[BUFFER_SIZE];
    QuantizedScale quantBuffer[BUFFER_SIZE];

    int start = int(result.size());
    int write = start;
    // Reserve the full final size up front so the resize in the loop never reallocates.
    result.reserve(start + source->Elements() * SCALE_ELEMENT_SIZE);
    while ( true )
    {
        // Pull up to BUFFER_SIZE (bias, scale) pairs; a short read means end-of-stream.
        int count = source->Get(scaleBuffer, quantBuffer, BUFFER_SIZE);
        result.resize(write + (count * SCALE_ELEMENT_SIZE));

        for ( int i = 0; i < count; i++ )
        {
            write += EncodeBias(scaleBuffer[i], quantBuffer[i].scale, quantBuffer[i].shift, &result[write]);
        }
        if ( count < BUFFER_SIZE )
        {
            break;
        }
    }

    return write - start;
}
+// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "architecture/architecture.hpp" +#include "architecture/ethos_u_scaling.hpp" +#include "architecture/ethosu85/ethos_u85.hpp" +#include "architecture/mlw_encode.hpp" +#include "architecture/weight_encoder.hpp" +#include "common/shape.hpp" + +namespace regor +{ + +/// +/// Encodes weights and biases. +/// +class EthosU85WeightEncoder : public WeightEncoder +{ +private: + struct EthosUEncodingConfig : IWeightEncodingConfig + { + private: + uint32_t _hash = 0; + uint32_t _depthOffsetHash = 0; + int _cores = 0; + Flags _weightFormat = WeightFormat::Default; + + public: + DataType ifmType = DataType::None; + int ofmBlockDepth = 0; + int ifmBlockDepth = 0; + EthosU85Traversal traversal = EthosU85Traversal::DepthFirst; + EthosU85Accumulator acc = EthosU85Accumulator::Acc32; + std::vector depthOffsets; + Point2i dilation; + Point2i stride; + Shape ohwiStrides; + Shape ofmUBlock; + + public: + EthosUEncodingConfig(int cores, Flags weightFormat); + void Rehash(); + uint32_t Hash() override; + bool Equals(IWeightEncodingConfig *other) override; + const std::vector &DepthOffsets() override; + Flags Format() override; + }; + +public: + EthosU85WeightEncoder(ArchEthosU85 *arch) : _arch(arch) {} + +public: + std::unique_ptr GetEncodingConfig(ArchitectureOpConfig *opCfg, const WeightsRef &weights, + const Kernel *kernel, DataType ifmType, const std::vector &depthOffsets, Flags format); + + int StreamsRequired(IWeightEncodingConfig *config, const Shape &weightShape, int &scaleStreamsRequired); + + std::unique_ptr GetWeightSource( + 
IWeightEncodingConfig *config, DataType weightType, WeightTransformFunc func, WeightTransformParam *param); + + std::unique_ptr GetScaleSource(IWeightEncodingConfig *config, DataType scaleType, const Quantization &explicitQuant); + + Quantization MakeExplicit(const Quantization &ifmQ, const Quantization &weightQ, const Quantization &ofmQ, + DataType scaleType, DataType ifmType); + + WeightsInfo EncodeWeights(IWeightEncodingConfig *config, IWeightSource *source, std::vector &result, bool measureOnly); + + int EncodeScales(IWeightEncodingConfig *config, IScaleSource *source, std::vector &result, bool measureOnly); + +private: + ArchEthosU85 *_arch; +}; + +} // namespace regor diff --git a/ethosu/regor/architecture/mlw_encode.cpp b/ethosu/regor/architecture/mlw_encode.cpp new file mode 100644 index 00000000..0b5f48f8 --- /dev/null +++ b/ethosu/regor/architecture/mlw_encode.cpp @@ -0,0 +1,125 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#include "mlw_encode.hpp" + +#include "common/common.hpp" + +#include "common/bit_flags.hpp" + +#include +#include + +BEGIN_ENUM_TABLE(WeightFormat) + ADD_ENUM_NAME(Default) + ADD_ENUM_NAME(Fast) + ADD_ENUM_NAME(Sparse2_4) +END_ENUM_TABLE() + +thread_local static std::vector *sResult = nullptr; + +static void *reallocFunc(void *ptr, size_t reserve, int purpose) +{ + UNUSED(purpose); + assert(sResult); + assert(purpose == MLW_ENCODE_ALLOC_STREAM0); + size_t offset = ptr ? static_cast(ptr) - sResult->data() : sResult->size(); + sResult->resize(reserve + offset); + return reserve ? static_cast(sResult->data() + offset) : nullptr; +} + +MlwEncodeResult mle_encode_proxy(IWeightSource *source, int chunkSize, std::vector &output, unsigned encodeFlags) +{ + assert(sResult == nullptr); + sResult = &output; + auto output_size = output.size(); + ml_encode_result_t res; + int zeroCount = 0; + ml_ethosu_encode_params_t params; + params.encoder_flags = encodeFlags; + params.source_buffering_hint = chunkSize; + params.realloc_func = reallocFunc; + + auto weight_func = [](int32_t query, ml_source_state_t *state, int16_t *buffer, int32_t size, void *user_arg) + { + UNUSED(query); + assert(query == MLW_SOURCE_QUERY_WEIGHTS); + IWeightSource *src = reinterpret_cast(user_arg); + int source_size = src->Get(buffer, size); + state->eos = source_size < size; + return source_size; + }; + + try + { + mle_context_t *ctx = nullptr; + auto ret = ml_encode_ethosu_stream(&res, ¶ms, weight_func, source, &ctx); + if ( ret < 0 ) throw std::runtime_error("mlw encode failed"); + zeroCount = mle_context_query_zeroes(ctx); + mle_destroy_context(ctx); + } + catch ( const std::runtime_error & ) + { + sResult = nullptr; + throw; + } + sResult = nullptr; + res.encoded_data = nullptr; // Data owned by output + mle_free(&res); + output.resize(output_size + res.encoded_length); + return {res.source_length, res.encoded_length, zeroCount}; +} + +MlwEncodeResult mle_encode_fwd_proxy(IWeightSource *source, 
int chunkSize, std::vector &output, unsigned encodeFlags) +{ + assert(sResult == nullptr); + sResult = &output; + auto output_size = output.size(); + int count = 0; + int totalCount = 0; + int zeroCount = 0; + std::vector weights; + + try + { + do + { + weights.resize(weights.size() + chunkSize); + count = source->Get(weights.data() + totalCount, chunkSize); + totalCount += count; + } while ( count == chunkSize ); + weights.resize(totalCount); + } + catch ( const std::runtime_error & ) + { + sResult = nullptr; + throw; + } + + ml_encode_result_t res; + mle_context_t *ctx = mle_create_context(MLW_ENCODE_SYNTAX_ETHOSU_FWD); + mle_context_set_allocator(ctx, reallocFunc); + mle_encode(ctx, &res, weights.data(), totalCount, encodeFlags); + zeroCount = mle_context_query_zeroes(ctx); + res.encoded_data = nullptr; // Data owned by output + mle_destroy_context(ctx); + mle_free(&res); + output.resize(output_size + res.encoded_length); + sResult = nullptr; + return {totalCount, res.encoded_length, zeroCount}; +} diff --git a/ethosu/regor/architecture/mlw_encode.hpp b/ethosu/regor/architecture/mlw_encode.hpp new file mode 100644 index 00000000..19d04544 --- /dev/null +++ b/ethosu/regor/architecture/mlw_encode.hpp @@ -0,0 +1,54 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+//
+
+#pragma once
+
+#include <cstdint>
+#include <vector>
+
+enum class WeightFormat : uint16_t
+{
+    Default = 0,
+    Fast = 1,
+    Sparse2_4 = 2
+};
+
+inline constexpr bool operator&(WeightFormat type, WeightFormat mask)
+{
+    return bool(unsigned(type) & unsigned(mask));
+}
+
+struct MlwEncodeResult
+{
+    int elements_read;
+    int bytes_written;
+    int zero_count;
+};
+
+class IWeightSource;
+
+class IWeightSource
+{
+public:
+    virtual ~IWeightSource() = default;
+    virtual int Elements() = 0;
+    virtual int Get(int16_t *buffer, int count) = 0;
+};
+
+MlwEncodeResult mle_encode_proxy(IWeightSource *source, int chunkSize, std::vector<uint8_t> &output, unsigned encodeFlags);
+MlwEncodeResult mle_encode_fwd_proxy(IWeightSource *source, int chunkSize, std::vector<uint8_t> &output, unsigned encodeFlags);
diff --git a/ethosu/regor/architecture/register_command_stream_generator.hpp b/ethosu/regor/architecture/register_command_stream_generator.hpp
new file mode 100644
index 00000000..5f4dc1d5
--- /dev/null
+++ b/ethosu/regor/architecture/register_command_stream_generator.hpp
@@ -0,0 +1,41 @@
+//
+// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the License); you may
+// not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an AS IS BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// + +#pragma once + +#include "compiler/high_level_command_stream.hpp" + +#include +#include +#include +#include +#include + +namespace regor +{ + +class IRegisterCommandStreamGenerator +{ +public: + virtual ~IRegisterCommandStreamGenerator() = default; + virtual std::vector GenerateCommandStream(std::vector> &highLevelCommandStream, + std::vector> *genRanges, bool verbose) = 0; + virtual void PrintCommandStream(const std::vector &stream, std::vector> &debugInfo) = 0; +}; + +} // namespace regor diff --git a/ethosu/regor/architecture/weight_encoder.hpp b/ethosu/regor/architecture/weight_encoder.hpp new file mode 100644 index 00000000..bf217b6c --- /dev/null +++ b/ethosu/regor/architecture/weight_encoder.hpp @@ -0,0 +1,210 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#pragma once + +#include "common/scaling.hpp" +#include "compiler/scheduler_operation.hpp" +#include "mlw_encode.hpp" + +#include +#include +#include + +namespace regor +{ + +/// +/// Contains info about encoded weights/scales for one core/depth offset combination +/// +struct WeightRange +{ + int offset = 0; + int scaleBytes = 0; + int weightOffset = 0; + int weightBytes = 0; + int index = 0; + + int TotalBytes() const { return scaleBytes + weightBytes; } +}; + +/// Produces key for indexing WeightTensor::encodedRanges +constexpr inline int WeightKey(int stream, int depth) +{ + return (depth << 8) + stream; +} + +struct WeightTransformParam +{ + int o, h, w, i; +}; + +typedef int (*WeightTransformFunc)(const WeightTransformParam *param, int weight); + +struct IWeightEncodingConfig +{ + virtual ~IWeightEncodingConfig() = default; + virtual uint32_t Hash() = 0; + virtual bool Equals(IWeightEncodingConfig *other) = 0; + virtual const std::vector &DepthOffsets() = 0; + virtual Flags Format() = 0; +}; + +struct IVolumeWeightSource : public IWeightSource +{ + virtual ~IVolumeWeightSource() = default; + virtual void SetSource(const void *buffer, int depthOffset, const Shape &ohwiShape, const Shape &ohwiStrides, int streamIndex) = 0; +}; + +struct IScaleSource +{ + virtual ~IScaleSource() = default; + virtual int Elements() = 0; + virtual int Get(int64_t *biasBuffer, QuantizedScale *quantBuffer, int count) = 0; +}; + +struct IVolumeScaleSource : public IScaleSource +{ + virtual ~IVolumeScaleSource() = default; + virtual void SetSource(const void *buffer, int biasCount, int depthOffset, int depthLength, int streamIndex) = 0; +}; + + +/// +/// Contains encoded weights and biases. 
+/// +class NpuWeightTensor : public SchedulerTensor +{ +public: + virtual ~NpuWeightTensor() = default; + + /** Required size in bytes if double buffering is applied */ + int maxRangeBytes = 0; + int totalWeightBytes = 0; + int subStreams = 0; + std::unordered_map encodedRanges; + std::unique_ptr config; +}; + +struct WeightScaleTensors +{ + /** Encoded weights, may be null */ + std::shared_ptr npuWeightsTensor; + /** Combined scaling parameters in the weights tensor **/ + uint32_t scaleHash; + /** Encoded scales, may be null */ + std::shared_ptr npuScalesTensor; +}; + + +struct WeightsRef +{ + BufferView *view = nullptr; + AxisOrder axisOrder = AxisOrder::Unknown; + DataType type = DataType::None; + bool isScales = false; +}; + +struct WeightsInfo +{ + int sourceSize = 0; + int encodedSize = 0; + int zeroCount = 0; + int streams = 0; +}; + +/// +/// Encodes weights and biases. +/// +class WeightEncoder +{ +public: + virtual ~WeightEncoder() = default; + + virtual std::unique_ptr GetEncodingConfig(ArchitectureOpConfig *opCfg, const WeightsRef &weights, + const Kernel *kernel, DataType ifmType, const std::vector &depthOffsets, Flags format) = 0; + + virtual int StreamsRequired(IWeightEncodingConfig *config, const Shape &ofmShape, int &scaleStreamsRequired) = 0; + + virtual std::unique_ptr GetWeightSource( + IWeightEncodingConfig *config, DataType weightType, WeightTransformFunc func, WeightTransformParam *param) = 0; + + virtual std::unique_ptr + GetScaleSource(IWeightEncodingConfig *config, DataType scaleType, const Quantization &explicitQuant) = 0; + + virtual Quantization MakeExplicit(const Quantization &ifmQ, const Quantization &weightQ, const Quantization &ofmQ, + DataType scaleType, DataType ifmType) = 0; + + virtual WeightsInfo EncodeWeights( + IWeightEncodingConfig *config, IWeightSource *source, std::vector &result, bool measureOnly) = 0; + virtual int EncodeScales(IWeightEncodingConfig *config, IScaleSource *source, std::vector &result, bool measureOnly) 
= 0; +}; + +// IVolumeWeightSource common implementation +class WeightSourceCommon : public IVolumeWeightSource +{ + +protected: + const void *_source; + int16_t _streams = 1; + int16_t _streamIndex = 0; + int _ofmDepth = 0; + int _ifmDepth = 0; + int _kernelH = 0; + int _kernelW = 0; + int _ohwiStrides[4]; + +protected: + void SetSourceCommon(const void *buffer, int depthOffset, const Shape &ohwiShape, const Shape &ohwiStrides, int streamIndex, bool separated) + { + assert(streamIndex < _streams); + _streamIndex = streamIndex; + + int streamOffset = Shape(depthOffset, 0, 0, 0).Dot(ohwiStrides); + _source = reinterpret_cast(buffer) + streamOffset; + _ifmDepth = ohwiShape[-1]; + _ofmDepth = separated ? (ohwiShape[0] + _streams - 1 - streamIndex) / _streams : ohwiShape[0]; + _kernelH = ohwiShape.Height(); + _kernelW = ohwiShape.Width(); + + // Bring in values for better cache locality + _ohwiStrides[0] = ohwiStrides[0] * (separated ? _streams : 1); + _ohwiStrides[1] = ohwiStrides[1]; + _ohwiStrides[2] = ohwiStrides[2]; + _ohwiStrides[3] = ohwiStrides[3]; + } + + int Elements() override { return _ofmDepth * _ifmDepth * _kernelH * _kernelW; } + + inline int WeightIndex(int ofm_z, int wy, int wx, int ifm_z) const + { + return ofm_z * _ohwiStrides[0] + wy * _ohwiStrides[1] + wx * _ohwiStrides[2] + ifm_z * _ohwiStrides[3]; + } +}; + +struct WeightEncodeException : public std::runtime_error +{ + WeightEncodeException() : std::runtime_error("weight encode") {} +}; + +struct WeightsNotSparse : public WeightEncodeException +{ + WeightsNotSparse() {} +}; + +} // namespace regor diff --git a/ethosu/regor/bindings/python/py_regor.cpp b/ethosu/regor/bindings/python/py_regor.cpp new file mode 100644 index 00000000..af8a2a51 --- /dev/null +++ b/ethosu/regor/bindings/python/py_regor.cpp @@ -0,0 +1,587 @@ +// +// SPDX-FileCopyrightText: Copyright 2021, 2023-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, 
Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "include/regor_interface.hpp" + +#include +#include +#include +#include +#include +#include + +#include "include/regor.h" + +#define _STRINGIFY(x) #x +#define STRINGIFY(x) _STRINGIFY(x) + +namespace py = pybind11; + +struct PyRegorMemoryAccess +{ + std::string accessType; + int64_t bytesRead = 0; + int64_t bytesWritten = 0; + int64_t accessCycles = 0; + + std::string ToString() const + { + std::string ret; + ret += "Access Type[" + accessType + "]\n"; + ret += "\tread = " + std::to_string(bytesRead) + "\n"; + ret += "\twrite = " + std::to_string(bytesWritten) + "\n"; + ret += "\tcycles = " + std::to_string(accessCycles) + "\n"; + return ret; + } +}; + +struct PyRegorMemoryPerf +{ + std::string memoryName; + int64_t peakUsage = 0; + std::unordered_map accesses; + + std::string ToString() const + { + std::string ret; + ret += "Memory[" + memoryName + "]\n"; + ret += "\tPeak usage = " + std::to_string(peakUsage) + "\n"; + return ret; + } +}; + +struct PyRegorPerfReport +{ + int64_t npuCycles = 0; + int64_t cpuCycles = 0; + int64_t totalCycles = 0; + int64_t macCount = 0; + int64_t cpuOps = 0; + int64_t npuOps = 0; + int64_t cascadedOps = 0; + int64_t cascades = 0; + int64_t originalWeights = 0; + int64_t encodedWeights = 0; + std::string stagingMemoryArea = ""; + + std::unordered_map memories; + + std::string ToString() const + { + assert(npuOps >= 0); + assert(cpuOps >= 0); + assert(cascadedOps >= 0); + assert(cascades >= 0); + + assert(npuOps <= 
(std::numeric_limits::max() - cpuOps)); + auto totalOpsBc = npuOps + cpuOps; + assert(totalOpsBc >= cascadedOps); + assert((totalOpsBc - cascadedOps) <= (std::numeric_limits::max() - cascades)); + + std::string ret; + ret += "NPU Cycles = " + std::to_string(npuCycles) + "\n"; + ret += "CPU Cycles = " + std::to_string(cpuCycles) + "\n"; + ret += "Total Cycles = " + std::to_string(totalCycles) + "\n"; + ret += "Total MACs = " + std::to_string(macCount) + "\n"; + ret += "CPU Operations = " + std::to_string(cpuOps) + "\n"; + ret += "NPU Operations = " + std::to_string(npuOps) + "\n"; + ret += "Total operations (before cascading) = " + std::to_string(totalOpsBc) + "\n"; + ret += "Total operations (after cascading) = " + std::to_string(totalOpsBc - cascadedOps + cascades) + "\n"; + ret += "Original Weights = " + std::to_string(originalWeights) + "\n"; + ret += "Encoded Weights = " + std::to_string(encodedWeights) + "\n"; + for ( const auto &[memName, memory] : memories ) + { + ret += memory.ToString(); + for ( const auto &[_memName, access] : memory.accesses ) + { + ret += access.ToString(); + } + } + return ret; + } +}; + +struct PyRegorDatabaseTable +{ + std::vector header; + std::vector> data; +}; + +struct PyRegorDatabase +{ + std::unordered_map tables; +}; + +struct PyRegorCompiledModel +{ + PyRegorCompiledModel() {} + + void SetPerfReport(PyRegorPerfReport &&report) { perf_report = std::move(report); } + void SetOptDatabase(PyRegorDatabase &&database) { opt_database = std::move(database); } + + PyRegorPerfReport perf_report; + PyRegorDatabase opt_database; +}; + +struct PyRegorCompiledRawModelConstantTensor +{ + PyRegorCompiledRawModelConstantTensor() = default; + PyRegorCompiledRawModelConstantTensor(uint8_t region_, py::bytes data_) : region(region_), data(std::move(data_)) {} + + uint8_t region = 0; + py::bytes data; +}; + +struct PyRegorCompiledRawModelNonConstantTensor +{ + PyRegorCompiledRawModelNonConstantTensor() = default; + 
PyRegorCompiledRawModelNonConstantTensor(int64_t region_, int64_t address_, uint32_t size_, uint8_t element_size_, std::vector &shape_) : + region(region_), address(address_), size(size_), element_size(element_size_), shape(shape_) + { + } + + uint8_t region = 0; + uint64_t address = 0; + uint32_t size = 0; + uint8_t element_size = 0; + std::vector shape; +}; + +struct PyRegorCompiledRawModel : PyRegorCompiledModel +{ + PyRegorCompiledRawModel() : PyRegorCompiledModel() {} + + py::bytes command_stream; + PyRegorCompiledRawModelConstantTensor read_only; + PyRegorCompiledRawModelNonConstantTensor scratch; + PyRegorCompiledRawModelNonConstantTensor scratch_fast; + std::vector inputs; + std::vector outputs; +}; + +struct PyRegorCompiledTFLiteModel : PyRegorCompiledModel +{ + PyRegorCompiledTFLiteModel() : PyRegorCompiledModel(), model(py::none()) {} + PyRegorCompiledTFLiteModel(py::object model_) : PyRegorCompiledModel(), model(std::move(model_)) {} + + py::object model; +}; + +class PyRegor +{ +public: + PyRegor(const std::string &arch) + { + if ( !regor_create(&_context, arch.c_str()) ) + { + throw std::invalid_argument("Unknown architecture " + arch); + } + std::cout.setf(std::ios::unitbuf); + std::cout.flush(); + regor_set_logging(&PyRegor::Log, 0); + } + + ~PyRegor() { regor_destroy(_context); } + + void SetSystemConfig(const std::string &config) + { + if ( !regor_set_system_config(_context, config.c_str(), config.size()) ) + { + throw std::invalid_argument("Invalid System Config"); + } + } + + void SetCompilerOptions(const std::string &options) + { + if ( !regor_set_compiler_options(_context, options.c_str(), options.size()) ) + { + throw std::invalid_argument("Invalid Compiler Options"); + } + } + + py::object PyCompile(py::bytes &input, const std::string &fmt, bool verbose) + { + // Extract input buffer and size of input buffer + py::buffer_info info(py::buffer(input).request()); + const void *in_data = reinterpret_cast(info.ptr); + size_t in_size = 
size_t(std::max(info.size, 0)); + + // Compile the input buffer and return a subclass of PyRegorCompiledModel + return Compile(in_data, in_size, fmt, verbose); + } + + PyRegorPerfReport GetPerfReport() + { + PyRegorPerfReport pyPerf; + regor_perf_report_t report; + int rStatus; + (void)(rStatus); + report.access = nullptr; + + rStatus = regor_get_perf_report(_context, &report); + assert(rStatus); + // Parse accessCount and call again to populate access + std::vector access; + if ( report.accessCount > 0 ) + { + access.resize(report.accessCount); + } + + report.access = access.data(); + rStatus = regor_get_perf_report(_context, &report); + assert(rStatus); + + pyPerf.npuCycles = report.npuCycles; + pyPerf.cpuCycles = report.cpuCycles; + pyPerf.totalCycles = report.totalCycles; + pyPerf.macCount = report.macCount; + pyPerf.cpuOps = report.cpuOps; + pyPerf.npuOps = report.npuOps; + pyPerf.cascadedOps = report.cascadedOps; + pyPerf.cascades = report.cascades; + pyPerf.originalWeights = report.originalWeights; + pyPerf.encodedWeights = report.encodedWeights; + if ( report.npuOps > 0 ) + { + assert(report.numMemories <= int(std::size(report.peakUsages))); + assert(report.stagingMemory >= 0 && report.stagingMemory < report.numMemories); + pyPerf.stagingMemoryArea = report.peakUsages[report.stagingMemory].memoryName; + for ( int i = 0; i < report.numMemories; ++i ) + { + std::string name = report.peakUsages[i].memoryName; + pyPerf.memories[name].memoryName = report.peakUsages[i].memoryName; + pyPerf.memories[name].peakUsage = report.peakUsages[i].peakUsage; + } + for ( const auto &acc : access ) + { + std::string name = acc.memoryName; + pyPerf.memories[name].accesses[name] = {acc.accessType, acc.bytesRead, acc.bytesWritten, acc.accessCycles}; + } + } + return pyPerf; + } + + PyRegorDatabase GetOptDatabase() + { + regor::IRegorReporting *reporting = regor_get_reporting_interface(_context); + assert(reporting); + regor::IDatabase *db = reporting->OptimiserDatabase(); + 
return ToPyDatabase(db); + } + +private: + static void Log(const void *data, size_t size) + { + assert(size <= std::numeric_limits::max()); + std::cout.write(reinterpret_cast(data), size); + } + + PyRegorDatabase ToPyDatabase(regor::IDatabase *db) + { + PyRegorDatabase pyDb; + if ( db != nullptr ) + { + regor::ITableIterator *table = db->Tables(); + while ( table->Next() ) + { + // table name + std::string name = table->Name(); + PyRegorDatabaseTable pyDbTable; + + // header + regor::IRowIterator *row = table->ColumnNames(); + bool isIndexed = (row->Id() > 0); + if ( isIndexed ) pyDbTable.header.push_back("id"); + while ( row->Next() ) + { + pyDbTable.header.push_back(row->Value()); + } + row->Release(); + + // data + int rowCount = table->Rows(); + for ( int i = 0; i < rowCount; i++ ) + { + std::vector data; + row = table->Row(i); + if ( isIndexed ) data.push_back(std::to_string(row->Id())); + while ( row->Next() ) + { + data.push_back(row->Value()); + } + row->Release(); + pyDbTable.data.push_back(data); + } + pyDb.tables[name] = std::move(pyDbTable); + } + table->Release(); + } + return pyDb; + } + + // Internal helper + py::object Compile(const void *input, size_t in_size, const std::string &_fmt, bool verbose) + { + int rStatus; + (void)(rStatus); + + // Capture these + regor_set_logging(&PyRegor::Log, verbose ? ~0u : 0u); + rStatus = regor_set_callback_arg(_context, this); + assert(rStatus); + + regor_format_t fmt = + _fmt == "TFLITE" ? REGOR_INPUTFORMAT_TFLITE : + _fmt == "TOSA" ? 
REGOR_INPUTFORMAT_TOSA : + REGOR_INPUTFORMAT_GRAPHAPI; + + if ( !regor_compile(_context, fmt, input, in_size, nullptr) ) + { + std::string last_error; + size_t last_error_len = 0; + + // Measure error length + if ( !regor_get_error(_context, nullptr, &last_error_len) ) + { + throw std::runtime_error("Compilation failed: Failed to fetch error"); + } + last_error.resize(last_error_len); + + if ( !regor_get_error(_context, last_error.data(), &last_error_len) ) + { + throw std::runtime_error("Compilation failed: Failed to fetch error"); + } + else + { + throw std::runtime_error("Compilation failed: " + last_error); + } + } + + // Get all compiler output + std::vector blobs; + { + regor::IRegorBlob *blob; + while ( regor_get_output(_context, &blob) ) + { + if ( blob ) blobs.push_back(blob); + } + } + + if ( blobs.size() == 1 ) + { + // Likely TFLite output + + assert(fmt == REGOR_INPUTFORMAT_TFLITE && "Only 1 output blob expected for TFLite input"); + + PyRegorCompiledTFLiteModel tfl; + tfl.SetPerfReport(GetPerfReport()); + tfl.SetOptDatabase(GetOptDatabase()); + + int64_t size; + void *buf = blobs[0]->Map(size); + tfl.model = py::bytes(reinterpret_cast(buf), size); + blobs[0]->Unmap(buf); + blobs[0]->Release(); + + return py::cast(tfl); + } + else if ( blobs.size() > 1 ) + { + // Likely raw output + + PyRegorCompiledRawModel raw; + raw.SetPerfReport(GetPerfReport()); + raw.SetOptDatabase(GetOptDatabase()); + + for ( auto &blob : blobs ) + { + int64_t size; + char *buf = reinterpret_cast(blob->Map(size)); + + regor_raw_tensor_header_t header; + std::copy_n(buf, sizeof(header), reinterpret_cast(&header)); + + char *data; + uint32_t data_size; + uint8_t region; + uint64_t address; + uint8_t element_size; + std::vector shape; + + switch ( header.type ) + { + case regor_raw_tensor_header_t::RAW_TENSOR_TYPE_COMMAND_STREAM: + data = buf + sizeof(header); + data_size = header.tensor.command_stream.size; + raw.command_stream = py::bytes(data, data_size); + break; + case 
regor_raw_tensor_header_t::RAW_TENSOR_TYPE_READ_ONLY:
+                    data = buf + sizeof(header);
+                    data_size = header.tensor.read_only.size;
+                    raw.read_only.region = header.tensor.read_only.region;
+                    raw.read_only.data = py::bytes(data, data_size);
+                    break;
+                case regor_raw_tensor_header_t::RAW_TENSOR_TYPE_SCRATCH:
+                    raw.scratch.region = header.tensor.scratch.region;
+                    raw.scratch.size = header.tensor.scratch.size;
+                    raw.scratch.address = header.tensor.scratch.address;
+                    break;
+                case regor_raw_tensor_header_t::RAW_TENSOR_TYPE_SCRATCH_FAST:
+                    raw.scratch_fast.region = header.tensor.scratch_fast.region;
+                    raw.scratch_fast.size = header.tensor.scratch_fast.size;
+                    raw.scratch_fast.address = header.tensor.scratch_fast.address;
+                    break;
+                case regor_raw_tensor_header_t::RAW_TENSOR_TYPE_INPUT:
+                    region = header.tensor.input.region;
+                    address = header.tensor.input.address;
+                    data_size = header.tensor.input.size;
+                    element_size = header.tensor.input.element_size;
+                    shape.insert(shape.end(), header.tensor.input.shape, header.tensor.input.shape + 4);
+                    raw.inputs.emplace_back(region, address, data_size, element_size, shape);
+                    break;
+                case regor_raw_tensor_header_t::RAW_TENSOR_TYPE_OUTPUT:
+                    region = header.tensor.output.region;
+                    address = header.tensor.output.address;
+                    data_size = header.tensor.output.size;
+                    element_size = header.tensor.output.element_size;
+                    shape.insert(shape.end(), header.tensor.output.shape, header.tensor.output.shape + 4);
+                    raw.outputs.emplace_back(region, address, data_size, element_size, shape);
+                    break;
+                default:
+                    break;
+            }
+
+            blob->Unmap(buf);
+            blob->Release();
+        }
+
+        return py::cast(raw);
+    }
+    else
+    {
+        throw std::runtime_error("Compilation generated no output blobs");
+    }
+}
+
+    regor_context_t _context;
+};
+
+PYBIND11_MODULE(regor, m)
+{
+    m.doc() = R"pbdoc(
+        Welcome to Regor - the brightest star in Vela
+    )pbdoc";
+
+    m.attr("__version__") = STRINGIFY(REGOR_VERSION);
+
+    py::class_<PyRegorMemoryAccess>(m, "MemoryAccess", "Regor memory accesses")
+        .def(py::init<>())
.def_readwrite("accessType", &PyRegorMemoryAccess::accessType, "Access type") + .def_readwrite("bytesRead", &PyRegorMemoryAccess::bytesRead, "Bytes read") + .def_readwrite("bytesWritten", &PyRegorMemoryAccess::bytesWritten, "Bytes written") + .def_readwrite("accessCycles", &PyRegorMemoryAccess::accessCycles, "Total access cycles") + .def("__repr__", &PyRegorMemoryAccess::ToString); + py::class_(m, "MemoryPerf", "A Regor memory performance report") + .def(py::init<>()) + .def_readwrite("memoryName", &PyRegorMemoryPerf::memoryName, "Memory name") + .def_readwrite("peakUsage", &PyRegorMemoryPerf::peakUsage, "Peak usage") + .def_readwrite("accesses", &PyRegorMemoryPerf::accesses, "Accesses") + .def("__repr__", &PyRegorMemoryPerf::ToString); + py::class_(m, "PerfReport", "A Regor performance report") + .def(py::init<>()) + .def_readwrite("npuCycles", &PyRegorPerfReport::npuCycles, "NPU elapsed cycles") + .def_readwrite("cpuCycles", &PyRegorPerfReport::cpuCycles, "CPU elapsed cycles") + .def_readwrite("totalCycles", &PyRegorPerfReport::totalCycles, "Total elapsed time in cycles") + .def_readwrite("macCount", &PyRegorPerfReport::macCount, "Number of Multiply-Accumulate operations") + .def_readwrite("cpuOps", &PyRegorPerfReport::cpuOps, "Number of CPU operations") + .def_readwrite("npuOps", &PyRegorPerfReport::npuOps, "Number of NPU operations") + .def_readwrite("cascadedOps", &PyRegorPerfReport::cascadedOps, "Number of cascaded operations") + .def_readwrite("cascades", &PyRegorPerfReport::cascades, "Number of cascades") + .def_readwrite("originalWeights", &PyRegorPerfReport::originalWeights, "Weights size (uncompressed)") + .def_readwrite("encodedWeights", &PyRegorPerfReport::encodedWeights, "Weights size (compressed)") + .def_readwrite("stagingMemoryArea", &PyRegorPerfReport::stagingMemoryArea, "Staging memory area") + .def_readwrite("memories", &PyRegorPerfReport::memories, "Memory performance report") + .def("__repr__", &PyRegorPerfReport::ToString); + + py::class_(m, 
"DatabaseTable", "A regor database table") + .def(py::init<>()) + .def_readwrite("header", &PyRegorDatabaseTable::header, "database headers") + .def_readwrite("data", &PyRegorDatabaseTable::data, "database data"); + + py::class_(m, "Database", "A regor database").def(py::init<>()).def_readwrite("tables", &PyRegorDatabase::tables, "database tables"); + + py::class_(m, "Regor", "The main Regor compiler class") + .def(py::init()) + .def("SetSystemConfig", &PyRegor::SetSystemConfig, "Set the system configuration") + .def("SetCompilerOptions", &PyRegor::SetCompilerOptions, "Set compiler options") + .def("Compile", &PyRegor::PyCompile, "Compile the input model into a TFLite Flatbuffer", py::arg("input"), + py::arg("fmt"), py::arg("verbose") = false) + .def("GetPerfReport", &PyRegor::GetPerfReport, "Get the performance report for the latest compiled model") + .def("GetOptDatabase", &PyRegor::GetOptDatabase, "Get the optimiser database for the latest compiled model"); + + py::class_(m, "CompiledRawModelNonConstantTensor", "A non-constant tensor of a Regor-compiled model in raw format") + .def(py::init<>()) + .def_readwrite("region", &PyRegorCompiledRawModelNonConstantTensor::region, "The tensor's region") + .def_readwrite("address", &PyRegorCompiledRawModelNonConstantTensor::address, "The tensor's address") + .def_readwrite("size", &PyRegorCompiledRawModelNonConstantTensor::size, "The tensor's size") + .def_readwrite("element_size", &PyRegorCompiledRawModelNonConstantTensor::element_size, "The tensor's element size") + .def_readwrite("shape", &PyRegorCompiledRawModelNonConstantTensor::shape, "The tensor's shape"); + + py::class_(m, "CompiledRawModelConstantTensor", "A constant tensor of a Regor-compiled model in raw format") + .def(py::init<>()) + .def_readwrite("region", &PyRegorCompiledRawModelConstantTensor::region, "The tensor's region") + .def_readwrite("data", &PyRegorCompiledRawModelConstantTensor::data, "The tensor's constant data"); + + py::class_(m, 
"CompiledModel", "A Regor-compiled model") + .def(py::init<>()) + .def_readwrite("perf_report", &PyRegorCompiledTFLiteModel::perf_report, "The performance report for the compiled model") + .def_readwrite("opt_database", &PyRegorCompiledTFLiteModel::opt_database, "The optimiser database for the compiled model"); + + py::class_(m, "CompiledRawModel", "A Regor-compiled model in raw format") + .def(py::init<>()) + .def_readwrite("command_stream", &PyRegorCompiledRawModel::command_stream, "The compiled model command stream") + .def_readwrite("read_only", &PyRegorCompiledRawModel::read_only, "The compiled model weights") + .def_readwrite("scratch", &PyRegorCompiledRawModel::scratch, "The compiled model scratch area") + .def_readwrite("scratch_fast", &PyRegorCompiledRawModel::scratch_fast, "The compiled model scratch fast area") + .def_readwrite("inputs", &PyRegorCompiledRawModel::inputs, "The compiled model inputs") + .def_readwrite("outputs", &PyRegorCompiledRawModel::outputs, "The compiled model outputs"); + + py::class_(m, "CompiledTFLiteModel", "A Regor-compiled TFLite model") + .def(py::init<>()) + .def_readwrite("model", &PyRegorCompiledTFLiteModel::model, "The compiled model TFLite blob"); + + m.def( + "compile", + [](const std::string &arch, py::bytes &input, const std::string &fmt, const std::string &sysconfig, + const std::string &options = "", bool verbose = false) -> py::object + { + PyRegor pyr(arch); + pyr.SetSystemConfig(sysconfig); + if ( options.size() > 0 ) + { + pyr.SetCompilerOptions(options); + } + + return pyr.PyCompile(input, fmt, verbose); + }, + R"pbdoc( + Compile a model + Returns a compiled model + )pbdoc", + py::arg("arch"), py::arg("input"), py::arg("fmt"), py::arg("sysconfig"), py::arg("options") = "", py::arg("verbose") = false); +} diff --git a/ethosu/regor/cmake/cpack_config.cmake b/ethosu/regor/cmake/cpack_config.cmake new file mode 100644 index 00000000..24042e1f --- /dev/null +++ b/ethosu/regor/cmake/cpack_config.cmake @@ -0,0 +1,83 @@ 
+# +# SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# only include last at the top level +if (NOT "${CMAKE_CURRENT_SOURCE_DIR}" STREQUAL "${CMAKE_SOURCE_DIR}") + return() +endif() + +include(utils) +include(InstallRequiredSystemLibraries) + +# Use a PEP-656 compliant package tag +# The default value for this variable is not useful +if (CMAKE_CROSSCOMPILING) + if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "i386") + set(REGOR_SYSTEM_NAME "linux-i586") + else() + message(FATAL_ERROR "Unknown cross-compile system") + endif() +else() + utils_find_python() + execute_process( + COMMAND ${Python3_EXECUTABLE} -c "import sysconfig; print(sysconfig.get_platform())" + OUTPUT_VARIABLE REGOR_SYSTEM_NAME OUTPUT_STRIP_TRAILING_WHITESPACE) +endif() + +set(CPACK_PACKAGE_NAME ${REGOR_PACKAGE_NAME}) + +if (NOT CPACK_PROJECT_NAME) + # Can be overriden + set(CPACK_PROJECT_NAME ${CMAKE_PROJECT_NAME}) +endif() +if (NOT CPACK_PACKAGE_NAME) + # Can be overriden + set(CPACK_PACKAGE_NAME ${CPACK_PROJECT_NAME}) +endif() +if (NOT CPACK_COMPONENT_NAME) + # Can be overriden + set(CPACK_COMPONENT_NAME ${CPACK_PROJECT_NAME}) +endif() + +set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${REGOR_SYSTEM_NAME}) + +# Default variables +set(CPACK_PACKAGE_VENDOR "Arm") +set(CPACK_PACKAGE_DESCRIPTION "${${CPACK_PROJECT_NAME}_DESCRIPTION}") +set(CPACK_PACKAGE_VERSION_MAJOR 
"${${CPACK_PROJECT_NAME}_VERSION_MAJOR}") +set(CPACK_PACKAGE_VERSION_MINOR "${${CPACK_PROJECT_NAME}_VERSION_MINOR}") +if ("${CMAKE_BUILD_TYPE}" STREQUAL "Debug") + set(CPACK_STRIP_FILES FALSE) +else() + set(CPACK_STRIP_FILES TRUE) +endif() +set(CPACK_VERBATIM_VARIABLES TRUE) + +# Archive generator setup +set(CPACK_BINARY_TGZ ON) +set(CPACK_BINARY_STGZ OFF) +set(CPACK_BINARY_TBZ2 OFF) +set(CPACK_BINARY_TXZ OFF) +set(CPACK_BINARY_TZ OFF) + +# Collect all exported targets +set(CPACK_INSTALL_CMAKE_PROJECTS + "${CMAKE_CURRENT_BINARY_DIR};${CPACK_PROJECT_NAME};${CPACK_COMPONENT_NAME};/") + +# Include CPack last +include(CPack) diff --git a/ethosu/regor/cmake/pkg-config.cmake.in b/ethosu/regor/cmake/pkg-config.cmake.in new file mode 100644 index 00000000..70c4ced5 --- /dev/null +++ b/ethosu/regor/cmake/pkg-config.cmake.in @@ -0,0 +1,26 @@ +# +# SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +@PACKAGE_INIT@ + +set_and_check(TARGETS_FILE + "${CMAKE_CURRENT_LIST_DIR}/@PACKAGE_NAME@-targets.cmake") + +include(${TARGETS_FILE}) + +check_required_components(@PACKAGE_NAME@) diff --git a/ethosu/regor/cmake/regor_dependencies.cmake b/ethosu/regor/cmake/regor_dependencies.cmake new file mode 100644 index 00000000..c971da02 --- /dev/null +++ b/ethosu/regor/cmake/regor_dependencies.cmake @@ -0,0 +1,82 @@ +# +# SPDX-FileCopyrightText: Copyright 2021, 2023-2024 Arm Limited and/or its affiliates +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +include_guard(GLOBAL) + +include(regor_lib) + +############################################################################ +# An add_subdirectory wrapper fixing up imported targets +############################################################################ + +function(regor_add_dependency dir) + add_subdirectory(${dir} ${ARGN}) + + # Get all target dirs + set(to_visit ${dir}) + set(total_dirs ${dir}) + while (to_visit) + set(_to_visit) + foreach (d IN LISTS to_visit) + get_directory_property(d_dirs DIRECTORY ${d} SUBDIRECTORIES) + if (d_dirs) + list(APPEND _to_visit ${d_dirs}) + endif() + endforeach() + list(REMOVE_DUPLICATES _to_visit) + if (_to_visit) + list(APPEND total_dirs ${_to_visit}) + endif() + set(to_visit ${_to_visit}) + endwhile() + list(REMOVE_DUPLICATES total_dirs) + + # Fix all targets + foreach (d IN LISTS total_dirs) + get_directory_property(bdir_targets DIRECTORY ${d} BUILDSYSTEM_TARGETS) + foreach (bdir_target IN LISTS bdir_targets) + # Skip custom targets + get_target_property(tp ${bdir_target} TYPE) + if ("${tp}" STREQUAL "UTILITY") + continue() + endif() + + # Add default flags + regor_add_options(${bdir_target}) + + # Set include paths as system + utils_set_system_include_paths(${bdir_target}) + + # Disable diagnostics + utils_disable_warnings(${bdir_target}) + + # Exclude from ALL. 
This can't be done directly in add_subdirectory + if (NOT "${tp}" STREQUAL "INTERFACE_LIBRARY") + set_target_properties(${bdir_target} PROPERTIES EXCLUDE_FROM_ALL TRUE) + endif() + endforeach() + endforeach() +endfunction() + +############################################################################ +# Add internal and thirdparty dependencies +############################################################################ + +regor_add_dependency("${CMAKE_CURRENT_SOURCE_DIR}/dependencies/mlw_codec") + +include(regor_thirdparty) diff --git a/ethosu/regor/cmake/regor_lib.cmake b/ethosu/regor/cmake/regor_lib.cmake new file mode 100644 index 00000000..512c1695 --- /dev/null +++ b/ethosu/regor/cmake/regor_lib.cmake @@ -0,0 +1,312 @@ +# +# SPDX-FileCopyrightText: Copyright 2021, 2023-2024 Arm Limited and/or its affiliates +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +include_guard(GLOBAL) + +include(GNUInstallDirs) +include(utils) +include(regor_options) +include(CMakePackageConfigHelpers) + + +# This function installs a target in a component with a namespace and produces a CMake export +# which links both the namespace and the component. +# An optional include sub-path can be specified for public headers under the component include +# path. 
This can be useful for "sub-component" +function(regor_install) + cmake_parse_arguments(_ + "" + "COMPONENT;NAMESPACE;TARGET;INCLUDE" + "" + ${ARGN}) + + if (NOT __COMPONENT) + return() + endif() + + install(TARGETS ${__TARGET} + EXPORT ${__COMPONENT}-targets + LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}" COMPONENT ${__COMPONENT} + ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}" COMPONENT ${__COMPONENT} + RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}" COMPONENT ${__COMPONENT} + PUBLIC_HEADER DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${__COMPONENT}/${__INCLUDE}" COMPONENT ${__COMPONENT} + ) + get_target_property(tp ${__TARGET} TYPE) + if (MSVC AND "${tp}" STREQUAL "STATIC_LIBRARY" AND NOT CMAKE_VERSION VERSION_LESS 3.15) + install(FILES + "$/$$.pdb" + DESTINATION ${CMAKE_INSTALL_LIBDIR} + COMPONENT ${__COMPONENT} + OPTIONAL) + endif() + # Emit export + if (NOT TARGET install-${__COMPONENT}) + install(EXPORT ${__COMPONENT}-targets + FILE ${__COMPONENT}-targets.cmake + NAMESPACE ${__NAMESPACE}:: + DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${__COMPONENT}" + COMPONENT ${__COMPONENT}) + + # Convenience target + add_custom_target(install-${__COMPONENT} + COMMAND + "${CMAKE_COMMAND}" -DCMAKE_INSTALL_COMPONENT=${__COMPONENT} + -P "${CMAKE_CURRENT_BINARY_DIR}/cmake_install.cmake") + + # Package definition. 
+ # These calls are wrappers around configure_file producing standard + # files to be employed by client code using find_package(${__COMPONENT}) + # They produce a relocatable package + set(PACKAGE_NAME ${__COMPONENT}) + configure_package_config_file(${REGOR_SOURCE_DIR}/cmake/pkg-config.cmake.in + ${CMAKE_CURRENT_BINARY_DIR}/${__COMPONENT}-config.cmake + INSTALL_DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${__COMPONENT}" + PATH_VARS PACKAGE_NAME) + unset(PACKAGE_NAME) + write_basic_package_version_file( + ${CMAKE_CURRENT_BINARY_DIR}/${__COMPONENT}-config-version.cmake + VERSION "${${PROJECT_NAME}_VERSION}" + COMPATIBILITY AnyNewerVersion + ) + + # Install for the produced configure_files above + install(FILES + "${CMAKE_CURRENT_BINARY_DIR}/${__COMPONENT}-config.cmake" + "${CMAKE_CURRENT_BINARY_DIR}/${__COMPONENT}-config-version.cmake" + DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${__COMPONENT}" + COMPONENT ${__COMPONENT}) + + # This produces a non-relocatable "quick" package that can be directly + # consumed after a build + export(EXPORT ${__COMPONENT}-targets + FILE ${CMAKE_CURRENT_BINARY_DIR}/${__COMPONENT}-targets.cmake) + endif() + add_dependencies(install-${__COMPONENT} ${__TARGET}) +endfunction() + +# This function implements an add_library wrapper for non-interface libraries. +# It makes sure: +# - flags are properly set +# - Install exports is done when the COMPONENT option is present +# +# Note all settings have a default scope pinned to PRIVATE except for OBJECT +# libraries and PUBLIC_HEADERS. 
Extra settings outside the scope of this function +# can be done to the target following the call +function(regor_lib) + cmake_parse_arguments(_ + "EXCLUDE_FROM_ALL" + "NAME;COMPONENT;TYPE;OUTPUT_NAME;INSTALL_LOCATION" + "PUBLIC_HEADERS;SOURCES;COPTS;DEFINES;LOPTS;DEPS;INC_DIRS" + ${ARGN}) + + if ("${__TYPE}" STREQUAL "STATIC") + set(__default_scope PRIVATE) + set(__is_dll FALSE) + elseif ("${__TYPE}" STREQUAL "SHARED") + set(__default_scope PRIVATE) + set(__is_dll TRUE) + elseif ("${__TYPE}" STREQUAL "OBJECT") + set(__default_scope PUBLIC) + set(__is_dll FALSE) + elseif ("${__TYPE}" STREQUAL "PY_MODULE") + set(__default_scope PRIVATE) + set(__is_dll TRUE) + else() + message(FATAL_ERROR "Unexpected lib type ${__TYPE}") + endif() + + if ("${__TYPE}" STREQUAL "PY_MODULE") + pybind11_add_module(${__NAME} ${__SOURCES}) + elseif (__EXCLUDE_FROM_ALL) + add_library(${__NAME} ${__TYPE} EXCLUDE_FROM_ALL ${__SOURCES}) + else() + add_library(${__NAME} ${__TYPE} ${__SOURCES}) + endif() + if (__OUTPUT_NAME) + if (MSVC AND "${__TYPE}" STREQUAL "SHARED") + # MSVC generates static artifacts for DLLs + set_target_properties(${__NAME} PROPERTIES RUNTIME_OUTPUT_NAME + ${__OUTPUT_NAME}) + set_target_properties(${__NAME} PROPERTIES ARCHIVE_OUTPUT_NAME + ${__OUTPUT_NAME}_st) + else() + set_target_properties(${__NAME} PROPERTIES OUTPUT_NAME + ${__OUTPUT_NAME}) + endif() + endif() + + foreach (dep IN LISTS __DEPS) + # Workaround for https://gitlab.kitware.com/cmake/cmake/-/issues/15415 + # Recurse deps + set(to_visit ${dep}) + set(total_deps ${dep}) + while (to_visit) + set(_to_visit) + foreach (tgt IN LISTS to_visit) + # Imported libs and non-targets are linked as usual + set(no_target FALSE) + set(imported FALSE) + if (TARGET ${tgt}) + get_target_property(imported ${tgt} IMPORTED) + get_target_property(tp ${tgt} TYPE) + else() + set(no_target TRUE) + endif() + if (no_target OR imported OR (__is_dll AND NOT "${tp}" STREQUAL "OBJECT_LIBRARY")) + target_link_libraries(${__NAME} 
${__default_scope} ${tgt}) + list(REMOVE_ITEM total_deps ${tgt}) + continue() + endif() + + get_target_property(ideps ${tgt} INTERFACE_LINK_LIBRARIES) + if (NOT ideps) + continue() + endif() + + foreach(idep IN LISTS ideps) + # Strip LINK_ONLY + string(REGEX REPLACE "^\\$$" "\\1" ridep "${idep}") + # Collect this target + list(APPEND total_deps ${ridep}) + list(APPEND _to_visit ${ridep}) + endforeach() + endforeach() + list(REMOVE_DUPLICATES _to_visit) + set(to_visit ${_to_visit}) + endwhile() + list(REMOVE_DUPLICATES total_deps) + + # Now "link" collected target dependencies + foreach (tgt IN LISTS total_deps) + get_target_property(tp ${tgt} TYPE) + if (NOT "${tp}" STREQUAL "INTERFACE_LIBRARY") + target_sources(${__NAME} ${__default_scope} $) + endif() + # Hand-link interface + target_link_options(${__NAME} ${__default_scope} $) + target_include_directories(${__NAME} ${__default_scope} $) + target_include_directories(${__NAME} SYSTEM ${__default_scope} $) + target_compile_options(${__NAME} ${__default_scope} $) + target_compile_definitions(${__NAME} ${__default_scope} $) + target_sources(${__NAME} ${__default_scope} $) + endforeach() + endforeach() + + regor_add_options(${__NAME}) + if (__INC_DIRS) + target_include_directories(${__NAME} ${__default_scope} ${__INC_DIRS}) + endif() + if (__DEFINES) + target_compile_definitions(${__NAME} ${__default_scope} ${__DEFINES}) + endif() + if (__COPTS) + target_compile_options(${__NAME} ${__default_scope} ${__COPTS}) + endif() + if (__is_dll AND __LOPTS) + target_link_options(${__NAME} ${__default_scope} ${__LOPTS}) + endif() + + if (NOT "${__TYPE}" STREQUAL "PY_MODULE") + set_target_properties(${__NAME} PROPERTIES + VERSION ${${PROJECT_NAME}_VERSION}) + + if (__PUBLIC_HEADERS) + set_target_properties(${__NAME} PROPERTIES + PUBLIC_HEADER "${__PUBLIC_HEADERS}") + endif() + endif() + foreach (dir IN LISTS __PUBLIC_HEADERS) + get_filename_component(ahdr ${dir} ABSOLUTE) + get_filename_component(dir ${ahdr} DIRECTORY) + 
target_include_directories(${__NAME} + INTERFACE + "$" + "$/${CMAKE_INSTALL_INCLUDEDIR}>") + endforeach() + + set(__NAMESPACE ${PROJECT_NAME}) + string(TOLOWER ${__NAMESPACE} __NAMESPACE) + if (__INSTALL_LOCATION AND __COMPONENT) + install(TARGETS ${__NAME} + RUNTIME DESTINATION "${__INSTALL_LOCATION}" COMPONENT "${__COMPONENT}" + LIBRARY DESTINATION "${__INSTALL_LOCATION}" COMPONENT "${__COMPONENT}" + ARCHIVE DESTINATION "${__INSTALL_LOCATION}" COMPONENT "${__COMPONENT}" + ) + if (NOT TARGET install-${__COMPONENT}) + add_custom_target(install-${__COMPONENT} + COMMAND + "${CMAKE_COMMAND}" + -DCMAKE_INSTALL_COMPONENT=${__COMPONENT} + -P "${CMAKE_BINARY_DIR}/cmake_install.cmake" + ) + endif() + add_dependencies(install-${__COMPONENT} ${__NAME}) + elseif (NOT "${__TYPE}" STREQUAL "OBJECT") + regor_install( + NAMESPACE ${__NAMESPACE} + COMPONENT ${__COMPONENT} + TARGET ${__NAME}) + endif() + add_library(${__NAMESPACE}::${__NAME} ALIAS ${__NAME}) +endfunction() + +# Same version of the above function for executables +function(regor_exe) + cmake_parse_arguments(_ + "EXCLUDE_FROM_ALL" + "NAME;COMPONENT;OUTPUT_NAME" + "SOURCES;COPTS;DEFINES;LOPTS;DEPS;INC_DIRS" + ${ARGN}) + + set(__default_scope PRIVATE) + + if (__EXCLUDE_FROM_ALL) + add_executable(${__NAME} EXCLUDE_FROM_ALL ${__SOURCES}) + else() + add_executable(${__NAME} ${__SOURCES}) + endif() + if (__OUTPUT_NAME) + set_target_properties(${__NAME} PROPERTIES OUTPUT_NAME + ${__OUTPUT_NAME}) + endif() + regor_add_options(${__NAME}) + if (__INC_DIRS) + target_include_directories(${__NAME} ${__default_scope} ${__INC_DIRS}) + endif() + if (__DEFINES) + target_compile_definitions(${__NAME} ${__default_scope} ${__DEFINES}) + endif() + if (__COPTS) + target_compile_options(${__NAME} ${__default_scope} ${__COPTS}) + endif() + if (__LOPTS) + target_link_options(${__NAME} ${__default_scope} ${__LOPTS}) + endif() + if (__DEPS) + target_link_libraries(${__NAME} ${__default_scope} ${__DEPS}) + endif() + 
set_target_properties(${__NAME} PROPERTIES + VERSION ${${PROJECT_NAME}_VERSION}) + + set(__NAMESPACE ${PROJECT_NAME}) + string(TOLOWER ${__NAMESPACE} __NAMESPACE) + regor_install( + NAMESPACE ${__NAMESPACE} + COMPONENT ${__COMPONENT} + TARGET ${__NAME}) +endfunction() diff --git a/ethosu/regor/cmake/regor_options.cmake b/ethosu/regor/cmake/regor_options.cmake new file mode 100644 index 00000000..8d22ec85 --- /dev/null +++ b/ethosu/regor/cmake/regor_options.cmake @@ -0,0 +1,279 @@ +# +# SPDX-FileCopyrightText: Copyright 2021, 2023-2024 Arm Limited and/or its affiliates +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +# No include guard - we want the below variables to be set in any including scope + +set(REGOR_DEFAULT_COPTS) +set(REGOR_DEFAULT_DOPTS) +set(REGOR_DEFAULT_LOPTS) + +set(REGOR_C_STANDARD 99) +set(REGOR_C_STANDARD_REQUIRED ON) +set(REGOR_C_EXTENSIONS OFF) +set(REGOR_CXX_STANDARD 17) +set(REGOR_CXX_STANDARD_REQUIRED ON) +set(REGOR_CXX_EXTENSIONS OFF) +set(REGOR_POSITION_INDEPENDENT_CODE ON) +set(REGOR_CXX_VISIBILITY_PRESET hidden) +set(REGOR_VISIBILITY_INLINES_HIDDEN ON) + +# Remove default debug flags from C/CXX flags, this is controlled by REGOR_ENABLE_ASSERT instead +string(TOUPPER ${CMAKE_BUILD_TYPE} UPPER_CONFIG) +string( REGEX REPLACE "[/-]D[;]*[N|_]DEBUG" "" CMAKE_CXX_FLAGS_${UPPER_CONFIG} "${CMAKE_CXX_FLAGS_${UPPER_CONFIG}}") +string( REGEX REPLACE "[/-]D[;]*[N|_]DEBUG" "" CMAKE_C_FLAGS_${UPPER_CONFIG} "${CMAKE_C_FLAGS_${UPPER_CONFIG}}") + +# Check ASSEMBLER/CXX/LINKER flag together with other_flags +# If it checks they all get added to flag_list +function (checked_flag tool flag flag_list) + set(other_flags ${ARGN}) + # Hash a var name for the cache + string(REGEX REPLACE "[ -;=]" "_" var_name "${flag} ${other_flags}") + + # Tool option + if (MSVC) + if ("${tool}" STREQUAL "LINKER") + set(tool_opt "/link") + endif() + else() + if ("${tool}" STREQUAL "LINKER") + set(tool_opt "-Xlinker") + elseif ("${tool}" STREQUAL "ASSEMBLER") + set(tool_opt "-Xassembler") + endif() + endif() + + if (${var_name}_set) + set(flag_not_supported "${${var_name}_val}") + else() + string(REPLACE " " ";" __flags "${flag}") + if (other_flags OR tool_opt) + list(INSERT __flags 0 ${other_flags} ${tool_opt}) + endif() + if (MSVC) + list(APPEND __flags "/link") + list(APPEND __flags "/out:cxx_check.exe") + else() + list(APPEND __flags "-o") + list(APPEND __flags "cxx_check") + endif() + file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/cxx_check.cc "int main(void){ return 0; }") + string(REPLACE " " ";" user_args "${CMAKE_CXX_COMPILER_ARG1}") + execute_process(COMMAND ${CMAKE_CXX_COMPILER} 
cxx_check.cc ${user_args} ${__flags} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + OUTPUT_QUIET ERROR_QUIET + RESULT_VARIABLE flag_not_supported) + set(${var_name}_val "${flag_not_supported}" CACHE INTERNAL "") + set(${var_name}_set TRUE CACHE INTERNAL "") + endif() + if (flag_not_supported) + message(STATUS "Looking for ${tool} flag support (${flag}) - Not found") + else() + message(STATUS "Looking for ${tool} flag support (${flag}) - Success") + string(REPLACE " " ";" flag "${tool_opt} ${flag} ${other_flags}") + set(${flag_list} "${${flag_list}};${flag}" PARENT_SCOPE) + endif() +endfunction() + +function(get_glibc_version var) + execute_process(COMMAND ldd --version + OUTPUT_VARIABLE LDD_STR + OUTPUT_STRIP_TRAILING_WHITESPACE) + string(REPLACE "\n" ";" LDD_LINES ${LDD_STR}) + list(GET LDD_LINES 0 LDD_FIRST_LINE) + string(REPLACE " " ";" LDD_FIRST_LINE_LIST ${LDD_FIRST_LINE}) + list(GET LDD_FIRST_LINE_LIST -1 GLIBC_VER) + set(${var} ${GLIBC_VER} PARENT_SCOPE) + message(STATUS "Looking for GLIBC - Found version ${GLIBC_VER}") +endfunction() + +# Base options +if(MSVC) + # On MSVC, CMake sets /GR by default (enabling RTTI), but we set /GR- + string(REPLACE "/GR" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + + list(APPEND REGOR_DEFAULT_COPTS + "$<$,$>:/bigobj>" # C1128: number of sections exceeded object file format limit: compile with /bigobj + "$<$,$>,$>:$,/GR,/GR->>" + "$<$,$>:/experimental:external>" + "$<$,$>:/external:W0>" + "$<$,$>:/external:anglebrackets>" + ) +else() + list(APPEND REGOR_DEFAULT_COPTS + "$,-frtti,-fno-rtti>" + "$<$:-ggdb>" + "$<$:-g3>" + "$<$:-fprofile-arcs>" + "$<$:-ftest-coverage>" + "$<$:-fno-omit-frame-pointer>" + # https://gitlab.kitware.com/cmake/cmake/-/issues/23136 + "$<$:$,-ffat-lto-objects,-flto=full>>" + "$<$:-fsanitize=${REGOR_SANITIZE}>" + ) + if (REGOR_SANITIZE) + # There's just too much code being added by the sanitizer + checked_flag(CXX "-fno-var-tracking-assignments" REGOR_DEFAULT_COPTS) + checked_flag(CXX 
"-fno-sanitize-recover=all" REGOR_DEFAULT_COPTS) + endif() + if ("${CMAKE_BUILD_TYPE}" STREQUAL "Debug") + # -gz via assembler + checked_flag(ASSEMBLER "--compress-debug-sections" REGOR_DEFAULT_COPTS "-fnodebug-types-section") + checked_flag(CXX "-gdwarf-4" REGOR_DEFAULT_COPTS) + endif() +endif() + +# Definitions +list(APPEND REGOR_DEFAULT_DOPTS + "$<$>:NDEBUG>" + "$<$:_DEBUG>" + "$<$:_GLIBCXX_DEBUG>" + "$<$:_GLIBCXX_DEBUG_PEDANTIC>" + "$<$:NOMINMAX>" + "$<$:_CRT_SECURE_NO_WARNINGS>" +) + +# Base link flags +if(NOT MSVC) + if ("${CMAKE_BUILD_TYPE}" STREQUAL "Debug") + checked_flag(LINKER "--compress-debug-sections=${REGOR_DEBUG_COMPRESSION}" gz_supported "-fdebug-types-section") + list(APPEND REGOR_DEFAULT_LOPTS ${gz_supported}) + # Add gold only if compressed sections are supported + checked_flag(LINKER "--compress-debug-sections=${REGOR_DEBUG_COMPRESSION}" gold_gz_supported "-fuse-ld=gold" "-fdebug-types-section") + if (gz_supported AND NOT gold_gz_supported) + set(REGOR_ENABLE_LDGOLD OFF CACHE BOOL "Enable Gold linker if available" FORCE) + endif() + unset(gz_supported) + unset(gold_gz_supported) + endif() + list(APPEND REGOR_DEFAULT_LOPTS + "$<$:-fprofile-arcs>" + "$<$:-fuse-ld=gold>" + "$<$:-s>" + "$<$:-static-libstdc++>" + "$<$:-static-libgcc>" + "$<$:-fsanitize=${REGOR_SANITIZE}>" + ) + if (REGOR_SANITIZE) + checked_flag(LINKER "-static-libubsan" REGOR_DEFAULT_LOPTS) + endif() + list(APPEND REGOR_DEFAULT_LOPTS "-lm") + if (UNIX AND NOT APPLE) + get_glibc_version(_GLIBC_VER) + if (_GLIBC_VER) + if (${_GLIBC_VER} VERSION_LESS 2.17) + list(APPEND REGOR_DEFAULT_LOPTS "-lrt") + endif() + endif() + endif() +endif() + +# Diagnostics +if(MSVC) + list(APPEND REGOR_DEFAULT_COPTS + "/W3" # Default warning level (severe + significant + production quality). 
+ "$<$:/WX>" + "$<$,$>:/wd4200>" # "nonstandard extension used : zero-sized array in struct/union" + "$<$,$>:/wd4018>" # "signed/unsigned mismatch in comparison" + "$<$,$>:/wd4146>" # operator applied to unsigned type, result still unsigned + "$<$,$>:/wd4244>" # possible loss of data + "$<$,$>:/wd4267>" # initializing: possible loss of data + "$<$,$>:/wd4005>" # allow: macro redefinition + "$<$,$>:/wd4065>" # allow: switch statement contains 'default' but no 'case' labels + "$<$,$>:/wd4141>" # allow: inline used more than once + "$<$,$>:/wd4624>" # allow: destructor was implicitly defined as deleted + "$<$,$>:/wd4146>" # operator applied to unsigned type, result still unsigned + "$<$,$>:/wd4244>" # possible loss of data + "$<$,$>:/wd4267>" # initializing: possible loss of data + "$<$,$>:/wd5105>" # allow: macro expansion producing 'defined' has undefined behavior + ) +else() + list(APPEND REGOR_DEFAULT_COPTS + "-Wall" + "-Wextra" + "$<$:-Werror>" + + "-Wdouble-promotion" + "-Wshadow" + "-Wredundant-decls" + "-Wcast-align" + "-Wmissing-declarations" + "-Wmissing-include-dirs" + "-Wswitch-enum" + "-Wswitch-default" + "-Winvalid-pch" + "-Wformat=2" + "-Wmissing-format-attribute" + "-Wformat-nonliteral" + "$<$:-Wold-style-cast>" + "-Wformat-security" + "-Wimplicit-fallthrough" + "$<$:-Wnon-virtual-dtor>" + "$<$>:-Woverloaded-virtual>" # Not working in GCC + "-Wvla" + "-Wformat-nonliteral" + "$<$:-Wlogical-op>" + + # Disabled + "-Wno-switch-enum" # TODO : Switch case in TFLite ops handling + "$<$:-Wno-array-bounds>" # TODO : False positives on Shape operators + "-Wno-unused-function" + "-Wno-unused" + "-Wno-double-promotion" + ) +endif() + +function(regor_add_options tgt) + get_target_property(tp ${tgt} TYPE) + if ("${tp}" STREQUAL "STATIC_LIBRARY") + set(__default_scope PRIVATE) + set(__link FALSE) + elseif ("${tp}" STREQUAL "OBJECT_LIBRARY") + set(__default_scope PUBLIC) + set(__link FALSE) + elseif ("${tp}" STREQUAL "SHARED_LIBRARY") + set(__default_scope PRIVATE) + 
set(__link TRUE) + elseif ("${tp}" STREQUAL "MODULE_LIBRARY") + set(__default_scope PRIVATE) + set(__link TRUE) + elseif ("${tp}" STREQUAL "EXECUTABLE") + set(__default_scope PRIVATE) + set(__link TRUE) + else() + return() + endif() + + set_target_properties(${tgt} PROPERTIES + C_STANDARD ${REGOR_C_STANDARD} + C_STANDARD_REQUIRED ${REGOR_C_STANDARD_REQUIRED} + C_EXTENSIONS ${REGOR_C_EXTENSIONS} + CXX_STANDARD ${REGOR_CXX_STANDARD} + CXX_STANDARD_REQUIRED ${REGOR_CXX_STANDARD_REQUIRED} + CXX_EXTENSIONS ${REGOR_CXX_EXTENSIONS} + POSITION_INDEPENDENT_CODE ${REGOR_POSITION_INDEPENDENT_CODE} + CXX_VISIBILITY_PRESET ${REGOR_CXX_VISIBILITY_PRESET} + VISIBILITY_INLINES_HIDDEN ${REGOR_VISIBILITY_INLINES_HIDDEN} + INTERPROCEDURAL_OPTIMIZATION ${REGOR_ENABLE_LTO} + INTERPROCEDURAL_OPTIMIZATION_${CMAKE_BUILD_TYPE} ${REGOR_ENABLE_LTO}) + + target_compile_definitions(${tgt} ${__default_scope} ${REGOR_DEFAULT_DOPTS}) + target_compile_options(${tgt} ${__default_scope} ${REGOR_DEFAULT_COPTS}) + if (__link) + target_link_options(${tgt} ${__default_scope} ${REGOR_DEFAULT_LOPTS}) + endif() +endfunction() diff --git a/ethosu/regor/cmake/regor_test.cmake b/ethosu/regor/cmake/regor_test.cmake new file mode 100644 index 00000000..b0995cac --- /dev/null +++ b/ethosu/regor/cmake/regor_test.cmake @@ -0,0 +1,123 @@ +# +# SPDX-FileCopyrightText: Copyright 2021, 2023 Arm Limited and/or its affiliates +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +# No inclusion guard. We need these set and included in every scope + +set(ENABLE_COVERAGE ${REGOR_ENABLE_COVERAGE} CACHE INTERNAL + "Enable coverage passed to codecov package") + +include(regor_lib) +include(utils) +include(CTest) +include(Catch) + +find_package(codecov) + +if (NOT TARGET check) + # Target to build and run all unit-tests from the top level + add_custom_target(check + COMMAND ${CMAKE_CTEST_COMMAND} -L unit_test --output-on-failure $<$:--verbose> + WORKING_DIRECTORY ${CMAKE_BINARY_DIR} + ) +endif() + +# Coverage target +if(REGOR_ENABLE_COVERAGE AND NOT TARGET coverage) + add_custom_target(coverage) + if(DEFINED ENV{CMAKE_BUILD_PARALLEL_LEVEL}) + set(CHECK_JOB_CNT --parallel $ENV{CMAKE_BUILD_PARALLEL_LEVEL}) + endif() + add_custom_command(TARGET coverage POST_BUILD + COMMAND ${CMAKE_COMMAND} --build ${CMAKE_BINARY_DIR} -t check ${CHECK_JOB_CNT} + COMMAND ${CMAKE_COMMAND} --build ${CMAKE_BINARY_DIR} -t lcov-capture + COMMAND ${CMAKE_COMMAND} --build ${CMAKE_BINARY_DIR} -t lcov-genhtml + ) +endif() + +function(add_catch_test) + cmake_parse_arguments( + _FARG + "" + "NAME" + "SOURCES;DEPS;COPTS;DEFINES;INC_DIRS" + ${ARGN} + ) + ### Adds an executable unit test and ammends it to the unit tests top target + regor_exe(NAME ${_FARG_NAME} + SOURCES ${_FARG_SOURCES} + COPTS ${_FARG_COPTS} + DEFINES ${_FARG_DEFINES} + INC_DIRS ${_FARG_INC_DIRS} + ) + # Links the individual tests to the build all target + add_dependencies(check ${_FARG_NAME}) + # Tests require introspection + foreach(_dep ${_FARG_DEPS}) + if (NOT TARGET ${_dep}) + continue() + endif() + get_target_property(__incs ${_dep} INCLUDE_DIRECTORIES) + if (__incs) + target_include_directories(${_FARG_NAME} PRIVATE ${__incs}) + endif() + endforeach() + # Link deps + target_link_libraries(${_FARG_NAME} PRIVATE ${_FARG_DEPS}) + # Now finally enable catch + target_link_libraries(${_FARG_NAME} PRIVATE regor::Catch2) + # Valgrind support + if (REGOR_ENABLE_VALGRIND) + find_program(VALGRIND_EXECUTABLE 
valgrind REQUIRED) + # We hijack CROSSCOMPILING_EMULATOR to get catch_ctest + # to prepend the valgrind command to all executables + set_property(TARGET ${_FARG_NAME} PROPERTY + CROSSCOMPILING_EMULATOR ${VALGRIND_EXECUTABLE} $ENV{VALGRIND_OPTIONS}) + endif() + catch_discover_tests(${_FARG_NAME} PROPERTIES LABELS unit_test) + # Coverage + add_coverage(${_FARG_NAME}) +endfunction() + +function(add_py_test) + cmake_parse_arguments( + _FARG + "" + "NAME" + "SOURCES;DEPS" + ${ARGN} + ) + list(TRANSFORM _FARG_SOURCES PREPEND ${CMAKE_CURRENT_LIST_DIR}/) + add_test(NAME ${_FARG_NAME} + COMMAND ${Python3_EXECUTABLE} -m pytest ${_FARG_SOURCES} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + ) + foreach(_dep ${_FARG_DEPS}) + list(APPEND _INC_DIRS $) + endforeach() + if(MSVC) + set(_path_sep ";") + else() + set(_path_sep ":") + endif() + string(REPLACE ";" "${_path_sep}" _INC_DIRS "${_INC_DIRS}") + set_tests_properties(${_FARG_NAME} + PROPERTIES + ENVIRONMENT "PYTHONPATH=${_INC_DIRS}${_path_sep}$ENV{PYTHONPATH}" + LABELS unit_test + ) +endfunction() diff --git a/ethosu/regor/cmake/regor_thirdparty.cmake b/ethosu/regor/cmake/regor_thirdparty.cmake new file mode 100644 index 00000000..7ad6aec2 --- /dev/null +++ b/ethosu/regor/cmake/regor_thirdparty.cmake @@ -0,0 +1,49 @@ +# +# SPDX-FileCopyrightText: Copyright 2021, 2023-2024 Arm Limited and/or its affiliates +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +include_guard(GLOBAL) + +include(regor_lib) + +############################################################################ +# Header only libs +############################################################################ + +add_library(fmt INTERFACE) +target_compile_definitions(fmt INTERFACE FMT_HEADER_ONLY) +target_include_directories(fmt SYSTEM INTERFACE + $ + $) +add_library(regor::fmt ALIAS fmt) + +set(CATCH2_DIR "dependencies/thirdparty/Catch2") +add_subdirectory(${CATCH2_DIR}) +target_include_directories(Catch2 SYSTEM INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/${CATCH2_DIR}/src/catch2) +include(Catch) +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/${CATCH2_DIR}/CMake") +add_library(regor::Catch2 ALIAS Catch2) + +add_library(flatbuffers INTERFACE) +target_include_directories(flatbuffers SYSTEM INTERFACE + "${CMAKE_CURRENT_SOURCE_DIR}/dependencies/thirdparty/flatbuffers/include") +add_library(regor::flatbuffers ALIAS flatbuffers) + +add_library(gemmlowp INTERFACE) +target_include_directories(gemmlowp SYSTEM INTERFACE + "${CMAKE_CURRENT_SOURCE_DIR}/dependencies/thirdparty/gemmlowp") +add_library(regor::gemmlowp ALIAS gemmlowp) diff --git a/ethosu/regor/cmake/toolchains/clang.cmake b/ethosu/regor/cmake/toolchains/clang.cmake new file mode 100644 index 00000000..2221fb61 --- /dev/null +++ b/ethosu/regor/cmake/toolchains/clang.cmake @@ -0,0 +1,51 @@ +# +# SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +include_guard(GLOBAL) + +set(CMAKE_TOOLCHAIN_PREFIX llvm-) + +find_program(_C_COMPILER "clang" REQUIRED) +set(CMAKE_C_COMPILER ${_C_COMPILER}) + +find_program(_CXX_COMPILER "clang++" REQUIRED) +set(CMAKE_CXX_COMPILER ${_CXX_COMPILER}) + +find_program(_CXX_COMPILER_AR "${CMAKE_TOOLCHAIN_PREFIX}ar" REQUIRED) +set(CMAKE_CXX_COMPILER_AR ${_CXX_COMPILER_AR}) +set(CMAKE_AR ${_CXX_COMPILER_AR}) + +find_program(_CXX_COMPILER_RANLIB "${CMAKE_TOOLCHAIN_PREFIX}ranlib" REQUIRED) +set(CMAKE_CXX_COMPILER_RANLIB ${_CXX_COMPILER_RANLIB}) +set(CMAKE_RANLIB ${_CXX_COMPILER_RANLIB}) + +# Find GCC base path and use it +find_program(_GCC_PATH gcc REQUIRED) +get_filename_component(_GCC_PATH ${_GCC_PATH} DIRECTORY) +add_compile_options(--gcc-toolchain=${_GCC_PATH}/..) +add_link_options(--gcc-toolchain=${_GCC_PATH}/..) + +# Add system headers from GCC +execute_process(COMMAND cpp -xc++ -Wp,-v /dev/null OUTPUT_QUIET ERROR_VARIABLE cpp_out) +string(REPLACE " " ";" cpp_out "${cpp_out}") +string(REPLACE "\n" ";" cpp_out "${cpp_out}") +foreach (e ${cpp_out}) + if (IS_DIRECTORY "${e}") + include_directories(SYSTEM ${e}) + endif() +endforeach() diff --git a/ethosu/regor/cmake/toolchains/clang32.cmake b/ethosu/regor/cmake/toolchains/clang32.cmake new file mode 100644 index 00000000..c262f13d --- /dev/null +++ b/ethosu/regor/cmake/toolchains/clang32.cmake @@ -0,0 +1,61 @@ +# +# SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +include_guard(GLOBAL) + +set(CMAKE_CROSSCOMPILING TRUE) +set(CMAKE_SYSTEM_NAME Linux) +set(CMAKE_SYSTEM_PROCESSOR i386 CACHE STRING "" FORCE) + +set(CMAKE_TOOLCHAIN_PREFIX llvm-) + +find_program(_C_COMPILER "clang" REQUIRED) +set(CMAKE_C_COMPILER ${_C_COMPILER}) + +find_program(_CXX_COMPILER "clang++" REQUIRED) +set(CMAKE_CXX_COMPILER ${_CXX_COMPILER}) + +find_program(_CXX_COMPILER_AR "${CMAKE_TOOLCHAIN_PREFIX}ar" REQUIRED) +set(CMAKE_CXX_COMPILER_AR ${_CXX_COMPILER_AR}) +set(CMAKE_AR ${_CXX_COMPILER_AR}) + +find_program(_CXX_COMPILER_RANLIB "${CMAKE_TOOLCHAIN_PREFIX}ranlib" REQUIRED) +set(CMAKE_CXX_COMPILER_RANLIB ${_CXX_COMPILER_RANLIB}) +set(CMAKE_RANLIB ${_CXX_COMPILER_RANLIB}) + +# Find GCC base path and use it +find_program(_GCC_PATH i686-linux-gnu-gcc) +if (NOT _GCC_PATH) + find_program(_GCC_PATH gcc REQUIRED) +endif() +get_filename_component(_GCC_PATH ${_GCC_PATH} DIRECTORY) +add_compile_options(--gcc-toolchain=${_GCC_PATH}/..) +add_link_options(--gcc-toolchain=${_GCC_PATH}/..) 
+ +# Add system headers from GCC +execute_process(COMMAND cpp -m32 -xc++ -Wp,-v /dev/null OUTPUT_QUIET ERROR_VARIABLE cpp_out) +string(REPLACE " " ";" cpp_out "${cpp_out}") +string(REPLACE "\n" ";" cpp_out "${cpp_out}") +foreach (e ${cpp_out}) + if (IS_DIRECTORY "${e}") + include_directories(SYSTEM ${e}) + endif() +endforeach() + +add_compile_options("-m32" "-march=i686" "-msse" "-msse2" "-mfpmath=sse") +add_link_options("-m32") diff --git a/ethosu/regor/cmake/toolchains/gcc.cmake b/ethosu/regor/cmake/toolchains/gcc.cmake new file mode 100644 index 00000000..5aa821fc --- /dev/null +++ b/ethosu/regor/cmake/toolchains/gcc.cmake @@ -0,0 +1,37 @@ +# +# SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +include_guard(GLOBAL) + + +set(CMAKE_TOOLCHAIN_PREFIX "gcc-") +set(CMAKE_TOOLCHAIN_SUFFIX "" CACHE STRING "Toolchain suffix e.g. 
-8 for GCC8") # Can be -9 for GCC9 or -8 for GCC8 etc +find_program(_C_COMPILER "gcc${CMAKE_TOOLCHAIN_SUFFIX}" REQUIRED) + +set(CMAKE_C_COMPILER ${_C_COMPILER}) + +find_program(_CXX_COMPILER "g++${CMAKE_TOOLCHAIN_SUFFIX}" REQUIRED) +set(CMAKE_CXX_COMPILER ${_CXX_COMPILER}) + +find_program(_CXX_COMPILER_AR "${CMAKE_TOOLCHAIN_PREFIX}ar${CMAKE_TOOLCHAIN_SUFFIX}" REQUIRED) +set(CMAKE_CXX_COMPILER_AR ${_CXX_COMPILER_AR}) +set(CMAKE_AR ${_CXX_COMPILER_AR}) + +find_program(_CXX_COMPILER_RANLIB "${CMAKE_TOOLCHAIN_PREFIX}ranlib${CMAKE_TOOLCHAIN_SUFFIX}" REQUIRED) +set(CMAKE_CXX_COMPILER_RANLIB ${_CXX_COMPILER_RANLIB}) +set(CMAKE_RANLIB ${_CXX_COMPILER_RANLIB}) diff --git a/ethosu/regor/cmake/toolchains/gcc32.cmake b/ethosu/regor/cmake/toolchains/gcc32.cmake new file mode 100644 index 00000000..19946a3e --- /dev/null +++ b/ethosu/regor/cmake/toolchains/gcc32.cmake @@ -0,0 +1,55 @@ +# +# SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +include_guard(GLOBAL) + + +set(CMAKE_CROSSCOMPILING TRUE) +set(CMAKE_SYSTEM_NAME Linux) +set(CMAKE_SYSTEM_PROCESSOR i386 CACHE STRING "" FORCE) + +# First look for a cross-compilation toolchain +set(CMAKE_TOOLCHAIN_PREFIX i686-linux-gnu-) +set(CMAKE_TOOLCHAIN_SUFFIX "" CACHE STRING "Toolchain suffix e.g. 
-8 for GCC8") # Can be -9 for GCC9 or -8 for GCC8 etc + +find_program(_C_COMPILER "${CMAKE_TOOLCHAIN_PREFIX}gcc${CMAKE_TOOLCHAIN_SUFFIX}") +if (NOT _C_COMPILER) + # Assume the main toolchain can manage + set(CMAKE_TOOLCHAIN_PREFIX "") +endif() + +# Find compilers +find_program(_C_COMPILER "${CMAKE_TOOLCHAIN_PREFIX}gcc${CMAKE_TOOLCHAIN_SUFFIX}" REQUIRED) +find_program(_CXX_COMPILER "${CMAKE_TOOLCHAIN_PREFIX}g++${CMAKE_TOOLCHAIN_SUFFIX}" REQUIRED) + +# Now set prefix for binutils +set(CMAKE_TOOLCHAIN_PREFIX ${CMAKE_TOOLCHAIN_PREFIX}gcc-) + +set(CMAKE_C_COMPILER ${_C_COMPILER}) +set(CMAKE_CXX_COMPILER ${_CXX_COMPILER}) + +find_program(_CXX_COMPILER_AR "${CMAKE_TOOLCHAIN_PREFIX}ar${CMAKE_TOOLCHAIN_SUFFIX}" REQUIRED) +set(CMAKE_CXX_COMPILER_AR ${_CXX_COMPILER_AR}) +set(CMAKE_AR ${_CXX_COMPILER_AR}) + +find_program(_CXX_COMPILER_RANLIB "${CMAKE_TOOLCHAIN_PREFIX}ranlib${CMAKE_TOOLCHAIN_SUFFIX}" REQUIRED) +set(CMAKE_CXX_COMPILER_RANLIB ${_CXX_COMPILER_RANLIB}) +set(CMAKE_RANLIB ${_CXX_COMPILER_RANLIB}) + +add_compile_options("-m32" "-march=i686" "-msse" "-msse2" "-mfpmath=sse") +add_link_options("-m32") diff --git a/ethosu/regor/cmake/utils.cmake b/ethosu/regor/cmake/utils.cmake new file mode 100644 index 00000000..f8d11598 --- /dev/null +++ b/ethosu/regor/cmake/utils.cmake @@ -0,0 +1,75 @@ +# +# SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +include_guard(GLOBAL) + +# Anchor the project root folder +set(REGOR_SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR}/.. CACHE INTERNAL "") + +# Flag target interface include paths as SYSTEM to prevent warnings leaking out +function(utils_set_system_include_paths tgt) + get_target_property(__tgt_type ${tgt} TYPE) + if (${__tgt_type} STREQUAL "INTERFACE_LIBRARY") + set(__default_scope INTERFACE) + else() + set(__default_scope PUBLIC) + endif() + get_target_property(itf_inc_dirs ${tgt} INTERFACE_INCLUDE_DIRECTORIES) + if (itf_inc_dirs) + target_include_directories(${tgt} SYSTEM BEFORE ${__default_scope} ${itf_inc_dirs}) + endif() +endfunction() + +# Disable warnings for target +function(utils_disable_warnings tgt) + get_target_property(__tgt_type ${tgt} TYPE) + if (NOT ${__tgt_type} STREQUAL "INTERFACE_LIBRARY") + # Remove previous setting to prevent MSVC warning + get_target_property(copts ${tgt} COMPILE_OPTIONS) + if (copts) + set(new_copts) + foreach (c IN LISTS copts) + if (c MATCHES "^/W[0-9]") + continue() + endif() + list(APPEND new_copts "${c}") + endforeach() + set_target_properties(${tgt} PROPERTIES COMPILE_OPTIONS "${new_copts}") + endif() + target_compile_options(${tgt} PRIVATE "$,,-w>") + endif() +endfunction() + +# Find Python the right way +macro(utils_find_python) + if (NOT Python3_FOUND) + if(CMAKE_VERSION VERSION_LESS "3.18.0") + if (DEFINED ENV{PYTHON_VERSION}) + find_package(Python3 $ENV{PYTHON_VERSION} EXACT COMPONENTS Interpreter Development REQUIRED) + else() + find_package(Python3 COMPONENTS Interpreter Development REQUIRED) + endif() + else() + if (DEFINED ENV{PYTHON_VERSION}) + find_package(Python3 $ENV{PYTHON_VERSION} EXACT COMPONENTS Interpreter Development.Module REQUIRED) + else() + find_package(Python3 COMPONENTS Interpreter Development.Module REQUIRED) + endif() + endif() + endif() +endmacro() diff --git a/ethosu/regor/common/bit_flags.hpp b/ethosu/regor/common/bit_flags.hpp new file mode 100644 index 00000000..15c8646d --- /dev/null +++ 
b/ethosu/regor/common/bit_flags.hpp @@ -0,0 +1,332 @@ +// +// SPDX-FileCopyrightText: Copyright 2021, 2023-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "common/common.hpp" + +#include "common/lexer.hpp" + +#include +#include + +struct EnumNameEntry +{ + unsigned value; + const char *name; +}; + +template +static ENUM OrFlags(ENUM a) +{ + return a; +} + +template +static ENUM OrFlags(ENUM a, ARGS... rest) +{ + return ENUM(typename std::underlying_type::type(OrFlags(std::forward(rest)...)) | typename std::underlying_type::type(a)); +} + +static std::string EnumFlagsToString(unsigned value, const EnumNameEntry *table, int tableLength); +static unsigned StringToEnumFlags(Lexer &lexer, const EnumNameEntry *table, int tableLength); + +/// +/// Enumerated flags wrapper +/// +template +struct Flags +{ + using TYPE = typename std::underlying_type::type; + +private: + TYPE _raw = 0; + +public: + Flags() {} + Flags(ENUM val) : _raw(TYPE(val)) {} + + template + Flags(ENUM val, ARGS... 
rest) + { + _raw = TYPE(val) | TYPE(OrFlags(std::forward(rest)...)); + } + + explicit Flags(TYPE val) : _raw(val) {} + Flags(const Flags &) = default; + +public: + bool operator==(ENUM val) const { return _raw == TYPE(val); } + bool operator!=(ENUM val) const { return _raw != TYPE(val); } + bool operator<(ENUM val) const { return _raw < TYPE(val); } + operator bool() const { return _raw != 0; } + operator ENUM() const { return ENUM(_raw); } + explicit operator unsigned() const { return unsigned(_raw); } + + Flags &operator=(ENUM val) + { + _raw = TYPE(val); + return *this; + } + Flags &operator=(const Flags &other) + { + _raw = other._raw; + return *this; + } + Flags &operator&=(ENUM val) + { + _raw &= TYPE(val); + return *this; + } + Flags &operator&=(const Flags &other) + { + _raw &= other._raw; + return *this; + } + Flags &operator|=(ENUM val) + { + _raw |= TYPE(val); + return *this; + } + Flags &operator|=(const Flags &other) + { + _raw |= other._raw; + return *this; + } + Flags &operator^=(ENUM val) + { + _raw ^= TYPE(val); + return *this; + } + Flags &operator^=(const Flags &other) + { + _raw ^= other._raw; + return *this; + } + + Flags operator&(ENUM val) const { return Flags(ENUM(_raw & TYPE(val))); } + Flags operator|(ENUM val) const { return Flags(ENUM(_raw | TYPE(val))); } + Flags operator^(ENUM val) const { return Flags(ENUM(_raw ^ TYPE(val))); } + Flags operator~() const { return Flags(ENUM(~_raw)); } + + // Extract non-bitfield item + unsigned GetUInt(ENUM offset, int bits) { return (_raw >> int(offset)) & ((1u << bits) - 1u); } + + // Set multiple flags + Flags &Set(TYPE val) + { + _raw |= TYPE(val); + return *this; + } + Flags &Set(ENUM val) { return Set(TYPE(val)); } + template + Flags &Set(ENUM val, ARGS... 
rest) + { + return Set(TYPE(val) | TYPE(OrFlags(std::forward(rest)...))); + } + + // Unset multiple flags + Flags &Unset(TYPE val) + { + _raw &= ~val; + return *this; + } + Flags &Unset(ENUM val) { return Unset(TYPE(val)); } + template + Flags &Unset(ENUM val, ARGS... rest) + { + return Unset(TYPE(val) | TYPE(OrFlags(std::forward(rest)...))); + } + + // Test any set + bool Any(TYPE val) const { return (_raw & TYPE(val)) != 0; } + bool Any(ENUM val) const { return Any(TYPE(val)); } + template + bool Any(ENUM val, ARGS... rest) const + { + TYPE mask = TYPE(val) | TYPE(OrFlags(std::forward(rest)...)); + return (_raw & mask) != 0; + } + + // Test all set + bool All(TYPE val) const { return (_raw & TYPE(val)) == TYPE(val); } + bool All(ENUM val) const { return All(TYPE(val)); } + template + bool All(ENUM val, ARGS... rest) const + { + TYPE mask = TYPE(val) | TYPE(OrFlags(std::forward(rest)...)); + return (_raw & mask) == mask; + } + + std::string ToString() const + { + int length = 0; + const EnumNameEntry *table = GetTable(length); + return EnumFlagsToString(_raw, table, length); + } + + void Parse(const std::string &text) + { + int length = 0; + const EnumNameEntry *table = GetTable(length); + Lexer lexer(text.data(), int(text.size())); + _raw = StringToEnumFlags(lexer, table, length); + } + +private: + // Proxy function for type erasure + static const EnumNameEntry *GetTable(int &length) + { + extern const EnumNameEntry *GetEnumTable(ENUM, int &); + length = 0; + return GetEnumTable(ENUM(0), length); + } +}; + +static std::string EnumToString(unsigned value, const EnumNameEntry *table, int tableLength) +{ + auto pos = std::find_if(table, table + tableLength, [&value](const EnumNameEntry &v) { return v.value == value; }); + if ( pos != table + tableLength ) + { + return pos->name; + } + return std::to_string(value); +} + +static std::string EnumFlagsToString(unsigned value, const EnumNameEntry *table, int tableLength) +{ + if ( value == 0 ) return EnumToString(value, 
table, tableLength); + unsigned mask = 1; + std::string text; + while ( mask <= value ) + { + if ( value & mask ) + { + if ( !text.empty() ) + { + text += '|'; + } + + auto pos = std::find_if( + table, table + tableLength, [&mask](const EnumNameEntry &v) { return v.value == mask; }); + if ( pos != table + tableLength ) + { + text += pos->name; + } + else + { + text += std::to_string(mask); + } + } + mask = mask << 1; + } + return text; +} + +template +static std::string EnumToString(ENUM value) +{ + extern const EnumNameEntry *GetEnumTable(ENUM, int &); + int length = 0; + auto table = GetEnumTable(ENUM(0), length); + return EnumToString(unsigned(value), table, length); +} + +static unsigned StringToEnumFlags(Lexer &lexer, const EnumNameEntry *table, int tableLength) +{ + unsigned value = 0; + std::string ident; + bool isXor = false; + while ( true ) + { + if ( !lexer.SkipSpace() ) + { + break; + } + if ( lexer.GetIdent(ident, false) ) + { + auto pos = std::find_if( + table, table + tableLength, [&ident](const EnumNameEntry &v) { return ident == v.name; }); + if ( pos != table + tableLength ) + { + value = isXor ? 
(value ^ pos->value) : (value | pos->value); + isXor = false; + } + } + if ( !lexer.SkipSpace() ) + { + break; + } + if ( lexer.Expect('^') ) + { + isXor = true; + } + else if ( !lexer.Expect('|') ) + { + break; + } + } + return value; +} + +template, int> = 0> +inline std::string format_as(const Flags &flags) noexcept +{ + return flags.ToString(); +} + +// Use to treat enumerations as flags when defined as single-bit +// numeric values: +// +// enum class Type +// { +// First=1, +// Second=2, +// Third=4 +// }; +// +// Wrap enumerations with Flags template and use as a bitset: +// +// Flags flags(Type::First, Type::Second); +// flags |= Type::Third; +// +// To convert to/from string, use macros to build a mapping table +// +// BEGIN_ENUM_TABLE(Type) +// ADD_ENUM_NAME(First) +// ADD_ENUM_NAME(Second) +// ADD_ENUM_NAME(Third) +// END_ENUM_TABLE() +// +// Then use flags.ToString() or flags.Parse() to convert between +// representations. + +#define BEGIN_ENUM_TABLE(TYPE) \ + const EnumNameEntry *GetEnumTable(TYPE disc, int &length); \ + const EnumNameEntry *GetEnumTable(TYPE disc, int &length) \ + { \ + (void)disc; \ + using ENUM_TYPE = TYPE; \ + static EnumNameEntry table[] = { +#define ADD_ENUM_NAME(ENUM) {unsigned(ENUM_TYPE::ENUM), #ENUM}, +#define END_ENUM_TABLE() \ + } \ + ; \ + length = int(std::size(table)); \ + return table; \ + } diff --git a/ethosu/regor/common/box.hpp b/ethosu/regor/common/box.hpp new file mode 100644 index 00000000..f4c957d1 --- /dev/null +++ b/ethosu/regor/common/box.hpp @@ -0,0 +1,63 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "common.hpp" +#include "shape.hpp" + +#include +#include + +class Box +{ +private: + Shape _start; + Shape _end; + +public: + Box() = default; + + Box(const Shape &start, const Shape &end) : _start(start), _end(end) + { + assert(start.Size() == end.Size()); + assert(start <= end); + } + + Box(const Shape &end) : Box(end.WithZeros(), end) {} + + const Shape &Start() const { return _start; } + const Shape &End() const { return _end; } + + Shape SizeShape() const { return _end - _start; } + + bool Overlaps(const Box &other) const + { + int sz = _start.Size(); + for ( int i = 0; i < sz; ++i ) + { + if ( !::Overlaps(_start[i], _end[i], other._start[i], other._end[i]) ) + { + return false; + } + } + return true; + } + + std::string ToString() const { return fmt::format("[{} - {}]", _start.ToString(), _end.ToString()); } +}; diff --git a/ethosu/regor/common/buffer_view.hpp b/ethosu/regor/common/buffer_view.hpp new file mode 100644 index 00000000..3c65f0d9 --- /dev/null +++ b/ethosu/regor/common/buffer_view.hpp @@ -0,0 +1,445 @@ +// +// SPDX-FileCopyrightText: Copyright 2021, 2023-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "common.hpp" +#include "shape.hpp" + +#include +#include +#include + +namespace regor +{ + +/// +/// Buffer mechanism for local/remote data storage +/// +class Buffer : public std::enable_shared_from_this +{ + typedef void (*DeleteFunc)(void *); + +#define FOR_ALL_INT_TYPES(functor, sep) \ + functor(uint8_t) sep functor(uint16_t) \ + sep functor(uint32_t) \ + sep functor(uint64_t) \ + sep functor(int8_t) \ + sep functor(int16_t) \ + sep functor(int32_t) \ + sep functor(int64_t) + + union LocalStorage + { + LocalStorage() {} + ~LocalStorage() {} +#define TYPE_FUNC(x) std::vector as_##x + FOR_ALL_INT_TYPES(TYPE_FUNC, ;); +#undef TYPE_FUNC + }; + + template + struct IsSupportedIntegral + { +#define TYPE_FUNC(x) std::is_same::value + static constexpr bool value = FOR_ALL_INT_TYPES(TYPE_FUNC, ||); +#undef TYPE_FUNC + }; + + template + struct IsByte + { + static constexpr bool value = + std::is_same::value || std::is_same::value || std::is_same::value; + }; + + // TODO : make a proper type hash + template + struct TypeHash + { + static constexpr uint32_t value = (std::is_signed::value ? 
1U << 16 : 0) | sizeof(TYPE); + }; + + union RefData + { + void *data; + const void *cdata; + }; + +private: + RefData _refData = {}; + int _sizeBytes = 0; + const uint32_t _typeHash; + const uint32_t _utypeHash; + bool _isLocal = false; + LocalStorage _localStorage; + DeleteFunc _deleter = nullptr; + +public: + Buffer(const Buffer &) = delete; + Buffer &operator=(const Buffer &) = delete; + + template::value, int> = 0> + Buffer(int sizeElements, const TYPE *buffer = nullptr, bool alias = false) : + _typeHash(TypeHash::value), _utypeHash(TypeHash>::value) + { + _sizeBytes = sizeof(TYPE) * sizeElements; + if ( buffer == nullptr || !alias ) + { + assert(sizeElements > 0); + auto ref = new TYPE[sizeElements]; + if ( buffer ) + { + std::copy_n(buffer, sizeElements, ref); + } + _refData.data = ref; + _deleter = &Buffer::DeleteArray; + } + else + { + assert(alias && buffer); + _refData.cdata = buffer; + } + } + + template::value, int> = 0> + Buffer(std::unique_ptr ptr) : + _typeHash(TypeHash::value), _utypeHash(TypeHash>::value) + { + _refData.data = ptr.release(); + _sizeBytes = sizeof(TYPE); + _deleter = &Buffer::Delete; + } + + template::value, int> = 0> + Buffer(std::unique_ptr ptr, int sizeElements) : + _typeHash(TypeHash::value), _utypeHash(TypeHash>::value) + { + _refData.data = ptr.release(); + assert(sizeElements > 0); + assert(INT_MAX / int(sizeof(TYPE)) >= sizeElements); + _sizeBytes = sizeof(TYPE) * sizeElements; + _deleter = &Buffer::DeleteArray; + } + + template::value, int> = 0> + Buffer(std::vector &&buffer) : + _typeHash(TypeHash::value), _utypeHash(TypeHash>::value) + { + new (&GetLocalVector()) std::vector(std::move(buffer)); + _deleter = &Buffer::DeleteVector; + _refData.data = &GetLocalVector(); + _isLocal = true; + } + + ~Buffer() + { + if ( _deleter ) + { + _deleter(_refData.data); + } + } + +public: + template + T *Data() + { + // Follow strict reinterpret_cast type aliasing rules + assert(IsByte::value || (TypeHash>::value == _utypeHash)); + if ( 
_isLocal ) + { + if constexpr ( IsByte::value ) + { + switch ( _typeHash ) + { +#define TYPE_FUNC(x) \ + case TypeHash::value: \ + return reinterpret_cast(GetLocalVector().data()) + FOR_ALL_INT_TYPES(TYPE_FUNC, ;); +#undef TYPE_FUNC + default: + assert(false); + return nullptr; + } + } + else + { + using S = std::make_signed_t; + using U = std::make_unsigned_t; + switch ( _typeHash ) + { + case TypeHash::value: + return reinterpret_cast(GetLocalVector().data()); + case TypeHash::value: + return reinterpret_cast(GetLocalVector().data()); + default: + assert(false); + return nullptr; + } + } + } + else + { + assert(_deleter); + return reinterpret_cast(_refData.data); + } + } + template + const T *Data() const + { + if ( _isLocal ) + { + // Follow strict reinterpret_cast type aliasing rules + assert(IsByte::value || (TypeHash>::value == _utypeHash)); + if constexpr ( IsByte::value ) + { + switch ( _typeHash ) + { +#define TYPE_FUNC(x) \ + case TypeHash::value: \ + return reinterpret_cast(GetLocalVector().data()) + FOR_ALL_INT_TYPES(TYPE_FUNC, ;); +#undef TYPE_FUNC + default: + assert(false); + return nullptr; + } + } + else + { + using S = std::make_signed_t; + using U = std::make_unsigned_t; + switch ( _typeHash ) + { + case TypeHash::value: + return reinterpret_cast(GetLocalVector().data()); + case TypeHash::value: + return reinterpret_cast(GetLocalVector().data()); + default: + assert(false); + return nullptr; + } + } + } + else + { + assert(uintptr_t(_deleter ? _refData.data : _refData.cdata) % alignof(T) == 0); + return reinterpret_cast(_deleter ? 
_refData.data : _refData.cdata); + } + } + + int Size() const + { + if ( _isLocal ) + { + switch ( _typeHash ) + { +#define TYPE_FUNC(x) \ + case TypeHash::value: \ + return int(GetLocalVector().size() * sizeof(x)) + FOR_ALL_INT_TYPES(TYPE_FUNC, ;); +#undef TYPE_FUNC + default: + assert(false); + return 0; + } + } + else + { + return _sizeBytes; + } + } + +private: + template + std::vector &GetLocalVector() + { + if constexpr ( false ) + { + } +#define TYPE_FUNC(x) else if constexpr ( std::is_same::value ) return _localStorage.as_##x + FOR_ALL_INT_TYPES(TYPE_FUNC, ;); +#undef TYPE_FUNC + else + { + static_assert(IsSupportedIntegral::value, ""); + return _localStorage.as_uint8_t; + } + } + template + const std::vector &GetLocalVector() const + { + if constexpr ( false ) + { + } +#define TYPE_FUNC(x) else if constexpr ( std::is_same::value ) return _localStorage.as_##x + FOR_ALL_INT_TYPES(TYPE_FUNC, ;); +#undef TYPE_FUNC + else + { + static_assert(IsSupportedIntegral::value, ""); + return _localStorage.as_uint8_t; + } + } + + template + static void Delete(void *p) + { + delete reinterpret_cast(p); + } + template + static void DeleteArray(void *p) + { + delete[] reinterpret_cast(p); + } + template + static void DeleteVector(void *v) + { + using vec = std::vector; + static_cast(v)->~vec(); + } +#undef FOR_ALL_INT_TYPES +}; + + +/// +/// Access proxy for processing values within a buffer +/// +template +class BufferValues +{ + using PTR_TYPE = typename std::conditional_t; + +private: + PTR_TYPE _data; + Shape _strideBytes; + +public: + BufferValues(PTR_TYPE data, const Shape &strideBytes) : _data(data), _strideBytes(strideBytes) {} + + template = 0> + TYPE &operator[](int index) + { + return _data[index]; + } + + const TYPE &operator[](int index) const { return _data[index]; } + + int ElementIndex(const Shape &offset) const + { + int index = offset.Dot(_strideBytes) / sizeof(TYPE); + return index; + } +}; + + +/// +/// View of buffer memory +/// +class BufferView +{ 
+protected: + std::shared_ptr _buffer; + int _elementBits = 0; + int _baseOffset = 0; + Shape _axisElements; + Shape _strideBytes; + +public: + BufferView() {} + + BufferView(const std::shared_ptr &buffer, int firstElement, int elementBits, const Shape &axisElements, const Shape &strideBytes) + { + assert(elementBits >= 8 && elementBits % 8 == 0); + _buffer = buffer; + _elementBits = elementBits; + _baseOffset = firstElement; + _axisElements = axisElements; + if ( strideBytes.IsEmpty() ) + { + // Calculate byte strides + int sz = axisElements.Size(); + if ( sz > 0 ) + { + std::vector strides(sz); + int v = 1; + for ( int i = sz - 1; i >= 0; --i ) + { + strides[i] = (v * elementBits) / 8; + v *= axisElements[i]; + } + + _strideBytes = Shape(&strides[0], sz); + } + } + else + { + _strideBytes = strideBytes; + } + } + + BufferView(const std::shared_ptr &buffer, const BufferView &other) + { + _buffer = buffer; + _elementBits = other._elementBits; + _baseOffset = 0; + _axisElements = other._axisElements; + _strideBytes = other._strideBytes; + } + +public: + bool HasBuffer() const { return _buffer != nullptr; } + const Shape &ViewShape() const { return _axisElements; } + const Shape &StrideBytes() const { return _strideBytes; } + + BufferView Reshape(const Shape &size) const + { + assert(size.Elements() == _axisElements.Elements()); + return BufferView(_buffer, 0, _elementBits, size, Shape()); + } + + BufferView SubView(const Shape &offset, const Shape &size) const + { + assert(size.Elements() < _axisElements.Elements()); + int linearOffset = offset.Dot(_strideBytes); + return BufferView(_buffer, linearOffset, _elementBits, size, _strideBytes); + } + + template + BufferValues Values() const + { + assert(HasBuffer()); + auto start = const_cast(_buffer.get())->Data() + _baseOffset; + return BufferValues(start, _strideBytes); + } + + template + BufferValues WritableValues() + { + assert(HasBuffer()); + auto start = _buffer->Data() + _baseOffset; + return BufferValues(start, 
_strideBytes); + } + + int BufferSize() const { return _buffer->Size(); } + + const class Buffer *Buffer() const { return _buffer.get(); } +}; + + +} // namespace regor diff --git a/ethosu/regor/common/common.cpp b/ethosu/regor/common/common.cpp new file mode 100644 index 00000000..c4a6f61a --- /dev/null +++ b/ethosu/regor/common/common.cpp @@ -0,0 +1,29 @@ +// +// SPDX-FileCopyrightText: Copyright 2021, 2023 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "common.hpp" + +#include + +UniqueId GenerateUniqueId() +{ + static std::atomic _id; + UniqueId id = _id.fetch_add(1); + assert(id != std::numeric_limits::max()); + return id; +} diff --git a/ethosu/regor/common/common.hpp b/ethosu/regor/common/common.hpp new file mode 100644 index 00000000..af33a901 --- /dev/null +++ b/ethosu/regor/common/common.hpp @@ -0,0 +1,172 @@ +// +// SPDX-FileCopyrightText: Copyright 2021, 2023-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +// Macro to mark variables as unused +#if !defined UNUSED +#define UNUSED(x) (void)(x) +#endif + +#define MACRO_EXPAND(a) a +#define MACRO_CONCAT_IMPL(a, b) a##b +#define MACRO_CONCAT(a, b) MACRO_CONCAT_IMPL(a, b) + +// clang-format off +#if __GNUC__ + #define DLL_EXPORT __attribute__((visibility("default"))) + #define _strnicmp strncasecmp +#elif _WIN32 + #if TARGET_WIN32_DLL + #define DLL_EXPORT __declspec(dllexport) + #else + #define DLL_EXPORT + #endif + #ifndef ssize_t + #define ssize_t ptrdiff_t + #endif +#else + #error "undefined export semantics" +#endif +// clang-format on + +#include +#include +#include +#include +#include +#include + +template, int> = 0> +constexpr std::underlying_type_t format_as(ENUMTYPE e) noexcept +{ + return static_cast>(e); +} + +namespace regor +{ +template, int> = 0> +constexpr std::underlying_type_t format_as(ENUMTYPE e) noexcept +{ + return static_cast>(e); +} +} // namespace regor + +#define DECLARE_ENUM_AS_FLAGS(ENUM_) \ + static constexpr inline ENUM_ operator&(ENUM_ a, ENUM_ b) \ + { \ + return ENUM_(std::underlying_type::type(a) & std::underlying_type::type(b)); \ + } \ + static constexpr inline ENUM_ operator|(ENUM_ a, ENUM_ b) \ + { \ + return ENUM_(std::underlying_type::type(a) | std::underlying_type::type(b)); \ + } \ + static constexpr inline ENUM_ operator^(ENUM_ a, ENUM_ b) \ + { \ + return ENUM_(std::underlying_type::type(a) ^ std::underlying_type::type(b)); \ + } + +using UniqueId = uint32_t; + +UniqueId GenerateUniqueId(); + +#define VERIFY(x_) (assert(x_), (x_)) + +namespace regor 
+{ + +#if defined __GNUC__ + +template +constexpr const char *PlatformRootName() +{ + const char *p = __PRETTY_FUNCTION__; + return p; +} + +template +constexpr std::string_view PlatformTypeName() +{ + const char *p = PlatformRootName(); + while ( *p++ != '=' ) + { + }; + while ( *p == ' ' ) + { + p++; + }; + std::size_t i = 0; + while ( (p[i] != 0) && (p[i] != ']') && (*p != ';') ) + i++; + return std::string_view(p, i); +} + +#elif _MSC_VER + +template +constexpr const char *PlatformRootName() +{ + const char *p = __FUNCSIG__; + return p; +} + +template +constexpr std::string_view PlatformTypeName() +{ + const char *p = PlatformRootName(); + while ( *p++ != '<' ) + ; + if ( (*p == 'c') || (*p == 'u') || (*p == 's') || (*p == 'e') ) + { + if ( std::string_view(p, 7) == "struct " ) p += 7; + else if ( std::string_view(p, 6) == "class " ) p += 6; + else if ( std::string_view(p, 6) == "union " ) p += 6; + else if ( std::string_view(p, 5) == "enum " ) p += 5; + } + std::size_t i = 0; + while ( p[i] != '(' ) + i++; + return std::string_view(p, i - 1); +} + +#else +#error No type hash for this target +#endif + +template +static constexpr uint32_t PlatformTypeHash() +{ + const std::string_view name = PlatformTypeName(); + auto p = name.begin(); + auto e = name.end(); + if constexpr ( NO_NAMESPACE ) + { + while ( *p != ':' ) + p++; + while ( *p == ':' ) + p++; + } + uint32_t hash = 0x811c9dc5; // FNV-1a SEED + while ( p < e ) + { + hash = (hash ^ uint8_t(*p++)) * 0x01000193; // FNV-1a PRIME + } + return hash; +} + +} // namespace regor diff --git a/ethosu/regor/common/data_type.cpp b/ethosu/regor/common/data_type.cpp new file mode 100644 index 00000000..d9c8ac21 --- /dev/null +++ b/ethosu/regor/common/data_type.cpp @@ -0,0 +1,72 @@ +// +// SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in 
compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "common/data_type.hpp" + +#include "common/logging.hpp" + +#include "common/bit_flags.hpp" + +BEGIN_ENUM_TABLE(regor::DataType) + ADD_ENUM_NAME(None) + ADD_ENUM_NAME(Bits4) + ADD_ENUM_NAME(Bits8) + ADD_ENUM_NAME(Bits16) + ADD_ENUM_NAME(Bits32) + ADD_ENUM_NAME(Bits64) + ADD_ENUM_NAME(Bits128) + ADD_ENUM_NAME(Signed) + ADD_ENUM_NAME(Asymmetric) + ADD_ENUM_NAME(Int) + ADD_ENUM_NAME(SignedInt) + ADD_ENUM_NAME(Int4) + ADD_ENUM_NAME(Int8) + ADD_ENUM_NAME(Int16) + ADD_ENUM_NAME(Int32) + ADD_ENUM_NAME(Int48) + ADD_ENUM_NAME(Int64) + ADD_ENUM_NAME(UInt8) + ADD_ENUM_NAME(UInt16) + ADD_ENUM_NAME(UInt32) + ADD_ENUM_NAME(UInt64) + ADD_ENUM_NAME(QInt) + ADD_ENUM_NAME(QInt4) + ADD_ENUM_NAME(QInt8) + ADD_ENUM_NAME(QInt12) + ADD_ENUM_NAME(QInt16) + ADD_ENUM_NAME(QInt32) + ADD_ENUM_NAME(QUInt) + ADD_ENUM_NAME(QUInt4) + ADD_ENUM_NAME(QUInt8) + ADD_ENUM_NAME(QUInt12) + ADD_ENUM_NAME(QUInt16) + ADD_ENUM_NAME(QUInt32) + ADD_ENUM_NAME(Float) + ADD_ENUM_NAME(Float16) + ADD_ENUM_NAME(Float32) + ADD_ENUM_NAME(Float64) + ADD_ENUM_NAME(Bool) + ADD_ENUM_NAME(Bool8) + ADD_ENUM_NAME(Complex) + ADD_ENUM_NAME(Complex64) + ADD_ENUM_NAME(Complex128) + ADD_ENUM_NAME(VariablySized) + ADD_ENUM_NAME(String) + ADD_ENUM_NAME(Resource) + ADD_ENUM_NAME(Variant) +END_ENUM_TABLE() diff --git a/ethosu/regor/common/data_type.hpp b/ethosu/regor/common/data_type.hpp new file mode 100644 index 00000000..06c79c2c --- /dev/null +++ b/ethosu/regor/common/data_type.hpp @@ -0,0 +1,290 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its 
affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once +#include "common/bit_flags.hpp" + +#include +#include +#include + +namespace regor +{ +class int48_t +{ +public: + int48_t() = default; + int48_t(const int48_t &obj) = default; + int48_t(int48_t &&obj) noexcept = default; + int48_t(const int64_t val) + { + for ( int i = 0; i < 6; i++ ) + { + _data[i] = (val & (uint64_t(0xFF) << i * 8)) >> (i * 8); + } + } + + int48_t(const uint64_t val) { int48_t(static_cast(val)); } + + operator int64_t() const + { + int64_t res = 0; + for ( int i = 0; i < 6; i++ ) + { + res |= uint64_t(_data[i]) << (16 + i * 8); + } + + return res >> 16; + } + + operator uint64_t() const { return static_cast(operator int64_t()); } + +private: + uint8_t _data[6]{0}; +}; + +enum class DataType : uint16_t +{ + None = 0, + // Bits 1 and 2 reserved for disambiguating variably sized types + Bits4 = 1 << 2, + Bits8 = 1 << 3, + Bits16 = 1 << 4, + Bits32 = 1 << 5, + Bits48 = Bits32 | Bits16, + Bits64 = 1 << 6, + Bits128 = 1 << 7, + Signed = 1 << 8, + Packed = 1 << 9, + Asymmetric = 1 << 10, + Int = 1 << 11, + SignedInt = Signed | Int, + Int4 = SignedInt | Bits4, + Int4Packed8 = SignedInt | Bits8 | Bits4 | Packed, + Int8 = SignedInt | Bits8, + Int16 = SignedInt | Bits16, + Int32 = SignedInt | Bits32, + Int48 = SignedInt | Bits48, + Int64 = SignedInt | Bits64, + UInt8 = Int | Bits8, + UInt16 = Int | Bits16, + UInt32 = Int | Bits32, + 
UInt48 = Int | Bits48, + UInt64 = Int | Bits64, + QInt = Asymmetric | SignedInt, + QInt4 = QInt | Bits4, + QInt8 = QInt | Bits8, + QInt12 = QInt | Bits8 | Bits4, + QInt16 = QInt | Bits16, + QInt32 = QInt | Bits32, + QUInt = Asymmetric, + QUInt4 = QUInt | Bits4, + QUInt8 = QUInt | Bits8, + QUInt12 = QUInt | Bits8 | Bits4, + QUInt16 = QUInt | Bits16, + QUInt32 = QUInt | Bits32, + Float = 1 << 12, + BFloat16 = Float | Bits16 | Packed, + Float16 = Float | Bits16, + Float32 = Float | Bits32, + Float64 = Float | Bits64, + Bool = 1 << 13, + Bool8 = Bool | Bits8, + Complex = 1 << 14, + Complex64 = Complex | Bits64, + Complex128 = Complex | Bits128, + VariablySized = 1 << 15, + String = VariablySized | 1, + Resource = VariablySized | 2, + Variant = VariablySized | 3, +}; + +inline constexpr DataType operator&(DataType type, DataType mask) +{ + return DataType(unsigned(type) & unsigned(mask)); +} +inline constexpr DataType operator&(DataType type, unsigned mask) +{ + return DataType(unsigned(type) & mask); +} +inline constexpr DataType operator|(DataType type, DataType mask) +{ + return DataType(unsigned(type) | unsigned(mask)); +} +inline constexpr DataType operator|(DataType type, unsigned mask) +{ + return DataType(unsigned(type) | mask); +} +inline constexpr bool operator!(DataType type) +{ + return type == DataType::None; +} + + +static inline int Clz(uint32_t value) +{ + // Ensure all CLZ implementations return '32 zeroes' + // for a zero value input. 
+ if ( value == 0 ) + { + return 32; + } +#if defined(__GNUC__) + return __builtin_clz(value); +#elif defined(_MSC_VER) + unsigned long index; + _BitScanReverse(&index, value); + return int(31 - index); +#else +#error "Missing platform CLZ32 implementation" +#endif +} + +inline constexpr int DataTypeStorageSizeBits(DataType type) +{ + unsigned bits = unsigned(type & 0x00FFu); + // Interpret packed word size as the largest set bit + if ( (type & DataType::Packed) == DataType::Packed ) + { + assert(bits > 0); + return 1 << (31 - Clz(bits)); + } + return (!(type & DataType::VariablySized) ? std::max(int(bits), 8) : -1); +} + +inline constexpr int DataTypeSizeBits(DataType type) +{ + unsigned bits = unsigned(type & 0x00FFu); + if ( (type & DataType::Packed) == DataType::Packed ) + { + assert(bits > 0); + bits ^= 1 << (31 - Clz(bits)); // Strip container word + } + return (!(type & DataType::VariablySized) ? int(bits) : -1); +} + +inline constexpr int DataTypeStorageSizeBytes(DataType type, int elements) +{ + const int storageBits = DataTypeStorageSizeBits(type); + const int bits = (type & DataType::Packed) == DataType::Packed ? DataTypeSizeBits(type) : storageBits; + assert(storageBits >= 8); + return (((elements * bits) + storageBits - 1) / storageBits) * (storageBits / 8); +} + + +inline constexpr int DataTypeElements(DataType type, int size) +{ + const int bits = (type & DataType::Packed) == DataType::Packed ? DataTypeSizeBits(type) : DataTypeStorageSizeBits(type); + assert(size <= std::numeric_limits::max() / 8); + return 8 * size / bits; +} + +inline constexpr DataType DataTypeBase(DataType type) +{ + return (!(type & DataType::VariablySized) ? 
type & 0xFF00u : type); +} + +inline std::string DataTypeToString(const DataType type) +{ + return EnumToString(type); +} + +inline constexpr bool IsInteger(DataType type) +{ + return (type & DataType::Int) == DataType::Int; +} + +inline constexpr bool IsSignedInteger(DataType type) +{ + return (type & DataType::SignedInt) == DataType::SignedInt; +} + +inline constexpr bool IsFloat(DataType type) +{ + return (type & DataType::Float) == DataType::Float; +} + +inline constexpr bool IsBool(DataType type) +{ + return (type & DataType::Bool) == DataType::Bool; +} + +inline constexpr uint64_t IntegerMax(DataType type) +{ + assert(IsInteger(type)); + return ~0ULL >> (64 - DataTypeSizeBits(type) + int(IsSignedInteger(type))); +} + +inline constexpr int64_t IntegerMin(DataType type) +{ + assert(IsInteger(type)); + if ( IsSignedInteger(type) ) + { + int size = DataTypeSizeBits(type); + return -(1LL << (size - 1)); + } + return 0; +} + +template +struct DataTypeOf +{ + static constexpr DataType value = DataType::None; +}; +template<> +struct DataTypeOf +{ + static constexpr DataType value = DataType::Int8; +}; +template<> +struct DataTypeOf +{ + static constexpr DataType value = DataType::Int16; +}; +template<> +struct DataTypeOf +{ + static constexpr DataType value = DataType::Int32; +}; +template<> +struct DataTypeOf +{ + static constexpr DataType value = DataType::Int64; +}; +template<> +struct DataTypeOf +{ + static constexpr DataType value = DataType::UInt8; +}; +template<> +struct DataTypeOf +{ + static constexpr DataType value = DataType::UInt16; +}; +template<> +struct DataTypeOf +{ + static constexpr DataType value = DataType::UInt32; +}; +template<> +struct DataTypeOf +{ + static constexpr DataType value = DataType::UInt64; +}; + +} // namespace regor diff --git a/ethosu/regor/common/dynamic_typing.hpp b/ethosu/regor/common/dynamic_typing.hpp new file mode 100644 index 00000000..236f8f2a --- /dev/null +++ b/ethosu/regor/common/dynamic_typing.hpp @@ -0,0 +1,255 @@ 
+// +// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "common/shape.hpp" + +#include + +namespace regor +{ + +template +struct FieldTypeId +{ + static constexpr uint8_t TYPEID = 0; +}; +// Ordinal types +template<> +struct FieldTypeId +{ + static constexpr uint8_t TYPEID = 1; +}; +template<> +struct FieldTypeId +{ + static constexpr uint8_t TYPEID = 3; +}; +template<> +struct FieldTypeId +{ + static constexpr uint8_t TYPEID = 4; +}; +template<> +struct FieldTypeId +{ + static constexpr uint8_t TYPEID = 5; +}; +template<> +struct FieldTypeId +{ + static constexpr uint8_t TYPEID = 6; +}; +template<> +struct FieldTypeId +{ + static constexpr uint8_t TYPEID = 7; +}; +template<> +struct FieldTypeId +{ + static constexpr uint8_t TYPEID = 8; +}; +template<> +struct FieldTypeId +{ + static constexpr uint8_t TYPEID = 9; +}; +template<> +struct FieldTypeId +{ + static constexpr uint8_t TYPEID = 10; +}; +template<> +struct FieldTypeId +{ + static constexpr uint8_t TYPEID = 11; +}; +template<> +struct FieldTypeId +{ + static constexpr uint8_t TYPEID = 12; +}; +template<> +struct FieldTypeId +{ + static constexpr uint8_t TYPEID = 13; +}; +// Class types +template<> +struct FieldTypeId +{ + static constexpr uint8_t TYPEID = 0x1F; +}; +template<> +struct FieldTypeId +{ + static constexpr uint8_t TYPEID = 0x2F; +}; + +template +struct 
TypeHash +{ + static constexpr uint32_t HASH = PlatformTypeHash(); +}; + +struct FieldInfo +{ + size_t offset = 0; + uint32_t id = 0; + uint8_t typeId = 0; +}; + +struct TypeInfo +{ + uint32_t _hash; + const FieldInfo *_fields; + size_t _fieldCount; + void (*_deleter)(void *); + void (*_addref)(void *); + +public: + uint32_t Hash() const { return _hash; } + void *AddRef(void *p) const + { + if ( !_addref ) return nullptr; + _addref(p); + return p; + } + void Delete(void *p) const { _deleter(p); } + const FieldInfo *Fields(size_t &length) const + { + length = _fieldCount; + return _fields; + } +}; + +// Dynamic allocation with type-erasure. Use to handle anonymous TYPE +// allocations by passing around a void pointer and the type information +// separately at runtime. +template +struct TypeInfoOf +{ + static void *DefaultNew() { return new TYPE(); } + static void DefaultDeleter(void *p) { delete static_cast(p); } + + struct SharedType + { + TYPE instance; + unsigned ref = 1; + }; + static void *SharedNew() + { + auto *p = new SharedType(); + assert(static_cast(p) == static_cast(&p->instance)); + return &p->instance; + } + static void SharedAddRef(void *p) + { + assert(p); + static_cast(p)->ref++; + } + static void SharedDeleter(void *p) + { + auto *shared = static_cast(p); + if ( --shared->ref == 0 ) delete shared; + } + + static const TypeInfo *Get(bool sharedInstancing) + { + size_t len; + const FieldInfo *f = TYPE::FieldTable(len); + static const TypeInfo s_infoDefault{PlatformTypeHash(), f, len, &DefaultDeleter, nullptr}; + static const TypeInfo s_infoShared{PlatformTypeHash(), f, len, &SharedDeleter, &SharedAddRef}; + return sharedInstancing ? 
&s_infoShared : &s_infoDefault; + } +}; + + +// Container for dynamically typed instances +struct DynamicRef +{ +private: + const TypeInfo *_info = nullptr; + void *_instance = nullptr; + +public: + DynamicRef() = default; + DynamicRef(const TypeInfo *info, void *inst) : _info(info), _instance(inst) {} + DynamicRef(const DynamicRef &other) { *this = other; } + DynamicRef(DynamicRef &&other) noexcept { *this = std::move(other); } + ~DynamicRef() + { + if ( _instance ) + { + assert(_info); + _info->Delete(_instance); + } + } + + DynamicRef &operator=(const DynamicRef &other) + { + if ( _instance ) + { + assert(_info); + _info->Delete(_instance); + _instance = nullptr; + } + if ( &other != this && other._instance ) + { + _info = other._info; + _instance = _info->AddRef(other._instance); + assert(_instance); + } + return *this; + } + + DynamicRef &operator=(DynamicRef &&other) noexcept + { + if ( &other != this ) + { + _info = other._info; + _instance = other._instance; + other._instance = nullptr; + } + return *this; + } + operator bool() const { return _info && _instance; } + void *Instance() { return _instance; } + const TypeInfo *Info() const { return _info; } +}; + + +// clang-format off + +#define BEGIN_FIELD_TABLE(CLASS_) \ + static const FieldInfo *FieldTable(size_t &len) { \ + typedef CLASS_ thisclass_t; \ + static const FieldInfo s_fieldTable[] = { + +#define END_FIELD_TABLE() }; \ + len = std::size(s_fieldTable); \ + return s_fieldTable; } + +// clang-format on + +#define REGOR_FIELD_TYPE(TYPE_) FieldTypeId::type>::TYPEID + + +} // namespace regor diff --git a/ethosu/regor/common/ini_reader.hpp b/ethosu/regor/common/ini_reader.hpp new file mode 100644 index 00000000..5a079900 --- /dev/null +++ b/ethosu/regor/common/ini_reader.hpp @@ -0,0 +1,300 @@ +// +// SPDX-FileCopyrightText: Copyright 2021, 2023-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may 
+// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "common.hpp" +#include "lexer.hpp" + +#include +#include +#include + + +/// +/// INI file reader +/// +class IniReader : Lexer +{ + enum class ParseState + { + None, + Section, + Key, + Value + }; + +protected: + ParseState _parseState = ParseState::None; + bool _wasError = false; + +public: + IniReader(const char *src, size_t length) : Lexer(src, length) {} + + IniReader(const IniReader &other) = delete; + +public: + bool Begin(std::string &key) + { + assert(_parseState != ParseState::Value); + + if ( _parseState == ParseState::None ) + { + if ( !SkipCommentSpace() ) + { + return false; + } + + // Expect section + if ( !Expect('[') ) + { + // Error state + _wasError = true; + return false; + } + _parseState = ParseState::Section; + } + + if ( _parseState == ParseState::Section ) + { + // Get section identifier + if ( !GetString(key, 0, 0, ']') ) + { + return false; + } + if ( !SkipWhite() ) + { + return false; + } + if ( !Expect(']') ) + { + // Error state + _wasError = true; + return false; + } + _parseState = ParseState::Key; + } + else if ( _parseState == ParseState::Key ) + { + if ( !SkipCommentSpace() ) + { + return false; + } + // Get key name + if ( !GetIdent(key) ) + { + _parseState = ParseState::None; + return false; + } + while ( Expect('.') ) + { + key += "."; + std::string tmp; + if ( GetIdent(tmp) ) + { + key += tmp; + } + } + if ( !SkipWhite() ) + { + return false; + } + if ( !Expect('=') ) + { + // Error - attempt to recover by jumping to next line + 
_wasError = true; + SkipUntil('\n', true); + return false; + } + _parseState = ParseState::Value; + } + + return true; + } + + void End() + { + if ( _parseState == ParseState::Value ) // End key/value pair + { + // End this line (doesn't work with quoted strings that span lines) + SkipUntil('\n', true); + _parseState = ParseState::Key; + } + else if ( _parseState == ParseState::Key ) // End section + { + // Skip until next section or EOF + std::string skipped; + while ( Begin(skipped) ) + { + assert(_parseState != ParseState::Key); // Prevent recursion depth > 2 + End(); + } + _parseState = ParseState::None; + } + } + + bool Read(bool &value) + { + if ( !SkipWhite() ) + { + return false; + } + + assert(_parseState == ParseState::Value); + + if ( Expect("true") ) value = true; + else if ( Expect("yes") ) value = true; + else if ( Expect("1") ) value = true; + else if ( Expect("false") ) value = false; + else if ( Expect("no") ) value = false; + else if ( Expect("0") ) value = false; + else return false; + + return true; + } + + bool Read(int64_t &value) + { + if ( !SkipSpace() ) + { + return false; + } + + assert(_parseState == ParseState::Value); + + auto [ptr, ec] = std::from_chars(_pos, _end, value); + if ( _pos == ptr ) + { + return false; + } + _pos = ptr; + + // Allow comma-separated value reads + SkipSpace(); + Expect(','); + return true; + } + + bool Read(int &value) + { + if ( !SkipSpace() ) + { + return false; + } + + assert(_parseState == ParseState::Value); + + auto [ptr, ec] = std::from_chars(_pos, _end, value); + if ( _pos == ptr ) + { + return false; + } + _pos = ptr; + + // Allow comma-separated value reads + SkipSpace(); + Expect(','); + return true; + } + + bool Read(float &value) + { + if ( !SkipSpace() ) + { + return false; + } + + assert(_parseState == ParseState::Value); + + char *end; + value = std::strtof(_pos, &end); + if ( end == _pos ) + { + return false; + } + _pos = end; + + // Allow comma-separated value reads + SkipSpace(); + 
Expect(','); + return true; + } + + bool Read(std::string &value) + { + if ( !SkipSpace() ) + { + return false; + } + + assert(_parseState == ParseState::Value); + if ( GetString(value, '"', '\\', '\n') ) + { + if ( !value.empty() && value.back() == '\r' ) + { + value.pop_back(); + } + } + return true; + } + + template + bool Read(std::vector &out) + { + assert(_parseState == ParseState::Value); + + TYPE tmp{}; + while ( Read(tmp) ) + { + out.push_back(tmp); + } + return !out.empty(); + } + + template + TYPE Get() + { + TYPE tmp = TYPE(); + Read(tmp); + return tmp; + } + + ssize_t Position() const { return _pos - _source; } + +private: + bool SkipCommentSpace() + { + // Skip whitespace and comments + while ( true ) + { + if ( !SkipWhite() ) + { + return false; + } + if ( !Expect(';') ) + { + break; + } + if ( !SkipUntil('\n', true) ) + { + return false; + } + } + return true; + } +}; diff --git a/ethosu/regor/common/lexer.hpp b/ethosu/regor/common/lexer.hpp new file mode 100644 index 00000000..a6373f1d --- /dev/null +++ b/ethosu/regor/common/lexer.hpp @@ -0,0 +1,159 @@ +// +// SPDX-FileCopyrightText: Copyright 2021, 2023-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#pragma once + +#include +#include +#include +#include +#include + + +/// +/// Text string lexer +/// +class Lexer +{ +protected: + const char *_source = nullptr; + const char *_end = nullptr; + const char *_pos = nullptr; + +public: + Lexer(const char *text, size_t length) : _source(text), _end(text + length) { _pos = _source; } + + bool SkipSpace() + { + const char *p = _pos; + while ( p < _end && std::isblank(*p) ) + { + p++; + } + _pos = p; + return p < _end; + } + + bool SkipWhite() + { + const char *p = _pos; + while ( p < _end && std::isspace(*p) ) + { + p++; + } + _pos = p; + return p < _end; + } + + bool SkipUntil(char term, bool consume) + { + const char *p = _pos; + while ( (p < _end) && (*p != term) ) + { + p++; + } + if ( consume && (*p == term) ) + { + p++; + } + _pos = p; + return p < _end; + } + + char Peek() const + { + assert(_pos < _end); + return _pos < _end ? *_pos : '\0'; + } + + bool Expect(char c) + { + assert(_pos < _end); + if ( *_pos == c ) + { + _pos++; + return true; + } + return false; + } + + bool Expect(const char *text, size_t length = 0) + { + ptrdiff_t avail = std::max(0, _end - _pos); + size_t compare = length == 0 ? 
std::char_traits::length(text) : length; + compare = std::min(compare, size_t(avail)); + if ( strncmp(_pos, text, compare) == 0 ) + { + _pos += compare; + return true; + } + return false; + } + + bool GetIdent(std::string &ident, bool skipwhite = true) + { + if ( skipwhite && !SkipWhite() ) + { + return false; + } + + const char *p = _pos; + if ( *p != '_' && !std::isalpha(*p) ) + { + return false; + } + + const char *maxEnd = _end; + while ( (p < maxEnd) && (*p == '_' || std::isalnum(*p)) ) + { + p++; + } + + ident.assign(_pos, p); + _pos = p; + return !ident.empty(); + } + + bool GetString(std::string &text, char quote, char escape, char term) + { + text.reserve(16); + text.clear(); + + bool quoted = Expect(quote); + + const char *p = _pos; + while ( (p < _end) && (*p != term) ) + { + // Handle escaping first to allow quotes and terminator + // in string. + if ( (*p == escape) && (p < _end) ) + { + p++; + } + else if ( quoted ) + { + if ( *p == quote ) break; + } + text += *p; + p++; + } + + _pos = p; + return quoted || !text.empty(); // Quoted strings are intentionally present (just empty) + } +}; diff --git a/ethosu/regor/common/logging.cpp b/ethosu/regor/common/logging.cpp new file mode 100644 index 00000000..d33bd48c --- /dev/null +++ b/ethosu/regor/common/logging.cpp @@ -0,0 +1,70 @@ +// +// SPDX-FileCopyrightText: Copyright 2021, 2023-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "logging.hpp" + +namespace Logging +{ + +LogContext Out("", ~0u); + +static const char g_indentString[] = "\t\t\t\t\t\t\t\t\t\t\t\t"; + +LogContext::LogContext(const char *prefix, uint32_t filterMask) : _prefix(prefix), _filterMask(filterMask) +{ +} + +LogContext::~LogContext() +{ +} + +void LogContext::Indent() +{ + assert(_indent < int(sizeof(g_indentString))); + _indent++; +} + +void LogContext::Unindent() +{ + assert(_indent > 0); + _indent--; +} + +void LogContext::Write(const std::string &s) +{ + assert(_logWriter); + + if ( !_prefix.empty() ) + { + _logWriter(_prefix.data(), _prefix.size()); + } + if ( _indent != 0 ) + { + _logWriter(g_indentString, _indent); + } + + _logWriter(s.data(), s.size()); +} + +void LogContext::WriteLn(const std::string &s) +{ + Write(s); + _logWriter("\n", 1); +} + +} // namespace Logging diff --git a/ethosu/regor/common/logging.hpp b/ethosu/regor/common/logging.hpp new file mode 100644 index 00000000..fb71dec2 --- /dev/null +++ b/ethosu/regor/common/logging.hpp @@ -0,0 +1,200 @@ +// +// SPDX-FileCopyrightText: Copyright 2021, 2023 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#pragma once + +#include "common.hpp" + +#include +#include +#include + +// Log tracing detail settings, set LOG_TRACE_ENABLE to +// TD_0|TD_1|TD_2 to enable those traces before including +// this logging header. +#define TD_0 (1) +#define TD_1 (2) +#define TD_2 (4) + +#if !defined LOG_TRACE_ENABLE +#define LOG_TRACE_ENABLE (TD_0) +#endif + +#define LOG_TRACE0_ON (((LOG_TRACE_ENABLE)&TD_0) != 0) +#define LOG_TRACE0(...) \ + { \ + if ( LOG_TRACE0_ON ) \ + { \ + Logging::Out(8)(__VA_ARGS__); \ + } \ + } + +#define LOG_TRACE1_ON (((LOG_TRACE_ENABLE)&TD_1) != 0) +#define LOG_TRACE1(...) \ + { \ + if ( LOG_TRACE1_ON ) \ + { \ + Logging::Out(16)(__VA_ARGS__); \ + } \ + } + +#define LOG_TRACE2_ON (((LOG_TRACE_ENABLE)&TD_2) != 0) +#define LOG_TRACE2(...) \ + { \ + if ( LOG_TRACE2_ON ) \ + { \ + Logging::Out(32)(__VA_ARGS__); \ + } \ + } + +#if NDEBUG +#define LOG_DEBUG(...) \ + do \ + { \ + } while ( false ) +#else +#define LOG_DEBUG(...) Logging::Out(~0u)(__VA_ARGS__) +#endif + +#define LOG_PRINT(...) \ + { \ + Logging::Out(1)(__VA_ARGS__); \ + } +#define LOG_WARN(...) \ + { \ + Logging::Out(1)(__VA_ARGS__); \ + } +#define LOG_ERROR(...) \ + { \ + Logging::Out(2)(__VA_ARGS__); \ + } + + +extern "C" { +typedef void (*log_writer_t)(const void *data, size_t length); +} + +namespace Logging +{ + +/// +/// Logging context, currently outputs direct to stdout +/// +class LogContext +{ +private: + std::string _prefix; + unsigned _filterMask = ~0u; + int _indent = 0; + log_writer_t _logWriter = nullptr; + +public: + struct Filter + { + public: + LogContext *_context; + unsigned _mask; + + public: + Filter(LogContext *ctx, unsigned mask) : _context(ctx), _mask(mask) {} + + template + void operator()(const char *format, TYPES... args) const + { + if ( _mask & _context->_filterMask ) + { + _context->Write(fmt::format(format, std::forward(args)...)); + } + } + + template + void operator()(const std::string &format, TYPES... 
args) const + { + if ( _mask & _context->_filterMask ) + { + _context->Write(fmt::format(format.c_str(), std::forward(args)...)); + } + } + + template + void Print(const char *format, TYPES... args) const + { + if ( _mask & _context->_filterMask ) + { + _context->Write(fmt::format(format, std::forward(args)...)); + } + } + + template + void Print(const std::string &format, TYPES... args) const + { + if ( _mask & _context->_filterMask ) + { + _context->Write(fmt::format(format, std::forward(args)...)); + } + } + }; + +public: + LogContext(const char *prefix, unsigned filterMask); + ~LogContext(); + +public: + void SetPrefix(const char *prefix) { _prefix = prefix; } + void SetFilterMask(unsigned mask) { _filterMask = mask; } + unsigned FilterMask() const { return _filterMask; } + void SetWriter(log_writer_t writer) { _logWriter = writer; } + void Indent(); + void Unindent(); + void Write(const std::string &s); + void WriteLn(const std::string &s); + + template + void Print(const char *format, TYPES... args) + { + if ( _filterMask != 0 ) + { + Write(fmt::format(format, std::forward(args)...)); + } + } + + template + void Print(const std::string &format, TYPES... 
args) + { + if ( _filterMask != 0 ) + { + Write(fmt::format(format, std::forward(args)...)); + } + } + + Filter operator()(unsigned mask) { return Filter(this, mask); } +}; + +// Default output stream +extern LogContext Out; + +struct LogIndenter +{ + LogContext &_ctx; + LogIndenter(Logging::LogContext &ctx) : _ctx(ctx) { _ctx.Indent(); } + ~LogIndenter() { _ctx.Unindent(); } +}; + +#define LOG_INDENT(ctx) Logging::LogIndenter MACRO_CONCAT(_indent, __LINE__)(ctx) + +} // namespace Logging diff --git a/ethosu/regor/common/numeric_util.hpp b/ethosu/regor/common/numeric_util.hpp new file mode 100644 index 00000000..974db467 --- /dev/null +++ b/ethosu/regor/common/numeric_util.hpp @@ -0,0 +1,412 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#if _MSC_VER +#include +#endif + +template +bool CheckSafeAdd(T a, T b) +{ +#ifdef __GNUC__ + T res; + return !__builtin_add_overflow(a, b, &res); +#else + return (!( + (a > 0 && b > std::numeric_limits::max() - a) || + ((std::is_signed::value && (a < 0 && b < std::numeric_limits::min() - a))))); +#endif +} + +template +bool CheckSafeSub(T a, T b) +{ +#ifdef __GNUC__ + T res; + return !__builtin_sub_overflow(a, b, &res); +#else + return !( + (a > 0 && b < std::numeric_limits::min() + a) || + (std::is_signed::value && (a < 0 && b > std::numeric_limits::max() + a)) || (std::is_unsigned::value && a < b)); +#endif +} + +template +bool CheckSafeMul(T a, T b) +{ +#ifdef __GNUC__ + T res; + return !__builtin_mul_overflow(a, b, &res); +#else + return !( + (std::max(a, b) == -1 && std::min(a, b) == std::numeric_limits::min()) || + (b != 0 && (a > std::numeric_limits::max() / b || (std::is_unsigned::value && (a < std::numeric_limits::min() / b))))); +#endif +} + +template +T SafeAdd(T a, T b) +{ + if ( !CheckSafeAdd(a, b) ) throw std::overflow_error("Addition overflow"); + return a + b; +} + +template +T SafeSub(T a, T b) +{ + if ( !CheckSafeSub(a, b) ) throw std::overflow_error("Subtraction overflow"); + return a - b; +} + +template +T SafeMul(T a, T b) +{ + if ( !CheckSafeMul(a, b) ) throw std::overflow_error("Multiplication overflow"); + return a * b; +} + +template +T AssertAdd(T a, T b) +{ + assert(CheckSafeAdd(a, b)); + return a + b; +} +template +T AssertSub(T a, T b) +{ + assert(CheckSafeSub(a, b)); + return a - b; +} + +template +T AssertMul(T a, T b) +{ + assert(CheckSafeMul(a, b)); + return a * b; +} + +template +class Point2 +{ +public: + TYPE x = 0, y = 0; + +public: + Point2(TYPE xx = 0, TYPE yy = 0) : x(xx), y(yy) {} + +public: + TYPE AreaXY() const { return x * y; } + + Point2 operator+(const Point2 &pt) const + { + return Point2(AssertAdd(x, pt.x), 
AssertAdd(y, pt.y)); + } + Point2 operator-(const Point2 &pt) const + { + return Point2(AssertSub(x, pt.x), AssertSub(y, pt.y)); + } + Point2 operator*(const Point2 &pt) const + { + return Point2(AssertMul(x, pt.x), AssertMul(y, pt.y)); + } + Point2 operator/(const Point2 &pt) const { return Point2(x / pt.x, y / pt.y); } + + bool operator==(const Point2 &pt) const { return (x == pt.x) && (y == pt.y); } + bool operator!=(const Point2 &pt) const { return !((*this) == pt); } + bool operator<(const Point2 &pt) const { return (x < pt.x) || ((x == pt.x) && (y < pt.y)); } + + static Point2 Min(const Point2 &a, const Point2 &b) + { + return Point2(std::min(a.x, b.x), std::min(a.y, b.y)); + } + + static Point2 Max(const Point2 &a, const Point2 &b) + { + return Point2(std::max(a.x, b.x), std::max(a.y, b.y)); + } + + explicit operator uint32_t() const { return (uint32_t(x) << 16) ^ y; } + + explicit operator uint64_t() const { return (uint64_t(x) << 16) ^ y; } +}; + +template +struct Point2Hash +{ + size_t operator()(const Point2 &pt) const { return (pt.x * 8191) ^ pt.y; } +}; + +using Point2i = Point2; + +template +class Point3 +{ +public: + TYPE x = 0, y = 0, z = 0; + +public: + Point3(TYPE xx = 0, TYPE yy = 0, TYPE zz = 0) : x(xx), y(yy), z(zz) {} + +public: + TYPE AreaXY() const { return x * y; } + + Point3 operator+(const Point3 &pt) const { return Point3(x + pt.x, y + pt.y, z + pt.z); } + Point3 operator-(const Point3 &pt) const { return Point3(x - pt.x, y - pt.y, z - pt.z); } + Point3 operator*(const Point3 &pt) const { return Point3(x * pt.x, y * pt.y, z * pt.z); } + Point3 operator/(const Point3 &pt) const { return Point3(x / pt.x, y / pt.y, z / pt.z); } + + bool operator==(const Point3 &pt) const { return (x == pt.x) && (y == pt.y) && (z == pt.z); } + bool operator!=(const Point3 &pt) const { return !((*this) == pt); } + bool operator<(const Point3 &pt) const + { + return (x < pt.x) || ((x == pt.x) && (y < pt.y)) || ((x == pt.x) && (y == pt.y) && (z < pt.z)); + } 
+}; + + +template +TYPE RoundAway(TYPE value, TYPE align) +{ + assert(align > 0); + TYPE rem = value % align; + if ( rem == 0 ) + { + return value; + } + else if ( rem < 0 ) + { + return value - (align + rem); + } + return value + (align - rem); +} + +inline float RoundAway(float value, float align) +{ + assert(align > 0); + if ( value < 0 ) + { + value = value - align + 1; + } + else + { + value = value + align - 1; + } + return std::trunc(value / align) * align; +} + +template +TYPE RoundAwayZero(TYPE value) +{ + return std::trunc(value + (value < 0 ? -0.5 : 0.5)); +} + +template +TYPE RoundZero(TYPE value, TYPE align) +{ + assert(align > 0); + return value - (value % align); +} + +inline float RoundZero(float value, float align) +{ + return std::trunc(value / align) * align; +} + + +template +TYPE DivRoundUp(TYPE a, TYPE b) +{ + return TYPE((a + b - 1) / b); +} + +// Checks if the ranges overlap, to0 and to1 are exclusive +template +bool Overlaps(TYPE from0, TYPE to0, TYPE from1, TYPE to1) +{ + return from0 < to1 && from1 < to0; +} + +template +TYPE ClampSigmoid(TYPE x, TYPE limit) +{ + if ( x <= -limit ) + { + return TYPE(0); + } + else if ( x >= limit ) + { + return TYPE(1); + } + else + { + return TYPE(1 / (1 + std::exp(-x))); + } +} + +inline int NeededTotalPadding(int inputSize, int outputSize, int stride, int filterSize) +{ + int outSize = DivRoundUp(outputSize, stride); + int neededInput = (outSize - 1) * stride + filterSize; + return std::max(0, neededInput - inputSize); +} + +inline int NeededTotalPadding(int inputSize, int stride, int filterSize) +{ + return NeededTotalPadding(inputSize, inputSize, stride, filterSize); +} + +template +uint32_t SimpleHash32(const VALUE &value) +{ + return uint32_t(value); +} + +template +uint32_t SimpleHash32(const VALUE &value, REST &&...rest) +{ + return SimpleHash32(std::forward(rest)...) 
* 31 + uint32_t(value); +} + +template +uint64_t SimpleHash64(const VALUE &value) +{ + return uint64_t(value); +} + +template +uint64_t SimpleHash64(const VALUE &value, REST &&...rest) +{ + return SimpleHash64(std::forward(rest)...) * 31 + uint64_t(value); +} + +static constexpr uint32_t REGOR_FNV_SEED = 0x811c9dc5; +static constexpr uint32_t REGOR_FNV_PRIME = 0x01000193; + +inline uint32_t FNVHashBytes(uint32_t hash, const uint8_t *p, int length) +{ + while ( length-- ) + { + hash ^= *p++; + hash *= REGOR_FNV_PRIME; + } + return hash; +} + +template +uint32_t HashVector32(const std::vector &values) +{ + uint32_t hash = REGOR_FNV_SEED; + for ( auto const x : values ) + { + hash = FNVHashBytes(hash, reinterpret_cast(&x), sizeof(x)); + } + return hash; +} + +template +std::make_unsigned_t ToUnsigned(TYPE x) +{ + assert(x >= 0); + return static_cast>(x); +} + +inline int Clz32(uint32_t x) +{ +#if __GNUC__ + return x == 0 ? 32 : __builtin_clz(x); +#elif _MSC_VER + unsigned long leading_zero = 0; + return _BitScanReverse(&leading_zero, x) ? 31 - leading_zero : 32; +#else +#error "Unsupported toolchain" +#endif +} + +inline int Clz64(uint64_t x) +{ +#if __GNUC__ + return x == 0 ? 64 : __builtin_clzll(x); +#elif _MSC_VER + unsigned long leading_zero = 0; + return _BitScanReverse64(&leading_zero, x) ? 
63 - leading_zero : 64; +#else +#error "Unsupported toolchain" +#endif +} + +template +int IntLog2(T x) +{ + if ( x <= T(0) ) return 0; + + static_assert(std::is_arithmetic_v && (sizeof(T) <= 8), ""); + using itype = std::conditional_t || sizeof(T) == 8, uint64_t, uint32_t>; + itype n; + + if constexpr ( std::is_floating_point_v ) + { + n = itype(std::ceil(x)); + } + else + { + n = itype(x); + } + + if constexpr ( std::is_same_v ) + { + return 63 - Clz64(ToUnsigned(n)); + } + else + { + return 31 - Clz32(ToUnsigned(n)); + } +} + +template +constexpr bool IsPowerOfTwo(T x) +{ + static_assert(std::is_integral_v, ""); + return x > 0 && (x & (x - 1)) == 0; +} + +template +OUT ClampToType(IN x) +{ + static_assert(std::is_floating_point_v == std::is_floating_point_v, ""); + IN hi = std::numeric_limits::max(); + IN lo; + if constexpr ( std::is_floating_point_v ) + { + lo = -std::numeric_limits::max(); + } + else + { + lo = std::numeric_limits::min(); + } + return OUT(std::clamp(x, lo, hi)); +} diff --git a/ethosu/regor/common/ordered_map.hpp b/ethosu/regor/common/ordered_map.hpp new file mode 100644 index 00000000..35c1703d --- /dev/null +++ b/ethosu/regor/common/ordered_map.hpp @@ -0,0 +1,1112 @@ +// +// SPDX-FileCopyrightText: Copyright 2021, 2023-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace armstd +{ + +template +struct ordered_map_hash +{ + size_t operator()(const TYPE &value, size_t limit) const + { + // A better hash would map the incoming value to log2(limit) bits + // before applying the modulo. + // Disabled for hash performance + // coverity[cert_int33_c_violation] + return std::hash()(value) % limit; + } +}; + +template +struct ordered_map_hash>> +{ + size_t operator()(TYPE value, size_t limit) const + { + uintptr_t hash = uintptr_t(value); + // Pointers tend to have alignments that have lower bits set zero. + // TODO: use alignof(std::remove_pointer_t) to determine lower zero count. + hash ^= (hash >> 3) & 7; + hash ^= (hash >> 15) * 3; + // Disabled for hash performance + // coverity[cert_int33_c_violation] + return size_t(hash % limit); + } +}; + +template +struct ordered_map_hash || std::is_enum_v>> +{ + size_t operator()(TYPE value, size_t limit) const + { + // A C++ std::hash implementation need not return a linear mapping for arithmetic values because it is + // implementation defined. The hash function required for this ordered map should perform exactly the same + // across all platforms and ideally produce sensible hashes for small-valued incrementing keys (not all + // compilers do this). 
+ size_t hash = size_t(value); + // Mix in upper bits if the type is large (enum bit flags, for example) + if constexpr ( sizeof(value) > 2 ) + { + hash ^= (size_t(value) >> 15) * 3; + hash ^= (size_t(value) >> 24) * 5; + } + // Disabled for hash performance + // coverity[cert_int33_c_violation] + return hash % limit; + } +}; + + +template, typename INDEXER = int16_t, bool PURE_HASH_CHAINS = true> +class ordered_map +{ +protected: + using this_class_t = ordered_map; + static_assert(std::is_integral::value, "indexer must be integer"); + static constexpr INDEXER HASH_FREE = INDEXER(-2); + static constexpr INDEXER HASH_END = INDEXER(-1); + static constexpr INDEXER NODE_UNLINKED = INDEXER(-1); + + // Node stores two arrays in one allocation. + // - Items array, mapping key->untyped storage + // - The traversal order list + struct Node + { + typename std::aligned_storage::type value; // Untyped value storage (place + // first) + KEY key; // Map key + INDEXER order_next = NODE_UNLINKED; // order forwards traversal + INDEXER order_prev = NODE_UNLINKED; // order backwards traversal + INDEXER hash_next = HASH_FREE; // Same hash collision relocation (-2=free, -1=used/end, otherwise=next bucket) + + Node() = default; + + VALUE &Value() { return *reinterpret_cast(&value); } + + const VALUE &Value() const { return *reinterpret_cast(&value); } + + void copy_links(Node &other) + { + this->order_next = other.order_next; + this->order_prev = other.order_prev; + this->hash_next = other.hash_next; + } + }; + + std::unique_ptr _items; // Bulk node store + int _capacity = 0; // Total allocated capacity + int _itemCount = 0; // Number of inserted/valid items + INDEXER _tableSize = 0; // Hash table size + INDEXER _orderBegin = NODE_UNLINKED; // First item in insertion order + INDEXER _orderLast = NODE_UNLINKED; // Last item in insertion order + INDEXER _allocWatermark = 0; // Location of last overflow allocation + +public: + ordered_map(int initialCapacity = 3) { resize(initialCapacity); 
} + + ordered_map(std::initializer_list> init) + { + resize(int(init.size())); + for ( auto &pair : init ) + { + emplace(pair.first, pair.second); + } + } + + ordered_map(const std::pair *pairs, size_t count) + { + resize(int(count)); + while ( count-- ) + { + emplace(pairs->first, pairs->second); + pairs++; + } + } + + ordered_map(const ordered_map &other) { *this = other; } + + ordered_map(ordered_map &&other) { *this = std::move(other); } + + ordered_map &operator=(const ordered_map &other) + { + // Can't selectively delete operator= via enable_if_t + if constexpr ( !std::is_copy_constructible::value ) + { + throw std::invalid_argument("value type non-copyable"); + } + else if ( this != &other ) + { + clear(); + + // Duplicating a map will compact and rehash it + // pad a little to prevent an immediate resize + // if the user appends an item. + resize(other._itemCount + 2); + + // Duplicate in source's insertion order + int order = other._orderBegin; + while ( order != NODE_UNLINKED ) + { + auto *from = &other._items[order]; + int index = table_index_of(from->key); + auto *to = allocate_node(index, from->key); + to->key = from->key; + ::new (reinterpret_cast(&to->Value())) VALUE(from->Value()); + order = from->order_next; + } + } + return *this; + } + + ordered_map &operator=(ordered_map &&other) + { + if ( this != &other ) + { + clear(); + + _items = std::move(other._items); + _capacity = other._capacity; + _itemCount = other._itemCount; + _tableSize = other._tableSize; + _orderBegin = other._orderBegin; + _orderLast = other._orderLast; + _allocWatermark = other._allocWatermark; + other._itemCount = 0; + other._capacity = 0; + other._orderBegin = NODE_UNLINKED; + } + return *this; + } + + ~ordered_map() { clear(); } + +public: + int size() const { return _itemCount; } + bool empty() const { return _itemCount == 0; } + bool contains(const KEY &key) const + { + int index = table_index_of(key); + return find_node(index, key) != nullptr; + } + + void clear() + { + 
if ( !_items ) return; + // Call destructors in order + int order = _orderBegin; + for ( int i = 0; i < _itemCount; i++ ) + { + assert((i < _itemCount - 1) || (order == _orderLast)); // Consistency error + _items[order].Value().~VALUE(); + order = _items[order].order_next; + } + assert(order == NODE_UNLINKED); // Consistency error + + // Reset every entry in the hash table + for ( int i = 0; i < _capacity; i++ ) + { + _items[i].hash_next = HASH_FREE; + _items[i].order_next = NODE_UNLINKED; + _items[i].order_prev = NODE_UNLINKED; + } + + _itemCount = 0; + _orderBegin = NODE_UNLINKED; + _orderLast = NODE_UNLINKED; + } + + template + VALUE &emplace(const KEY &key, Args &&...args) + { + int index = table_index_of(key); + Node *node = find_node(index, key); + if ( node == nullptr ) + { + node = allocate_node(index, key); + ::new (reinterpret_cast(&node->value)) VALUE(std::forward(args)...); + } + return node->Value(); + } + + // insert(key, value) + // - Appends a new node for key at the end of the iteration order and copies or moves value into it + // - The key must not already be present in the map + // - If the map is at capacity then resizing will occur + // - If resizing occurs, all iterators and references are invalidated. Otherwise, they are unaffected. 
+ + void insert(const KEY &key, const VALUE &value) + { + int index = table_index_of(key); + Node *node = find_node(index, key); + if ( node == nullptr ) + { + node = allocate_node(index, key); + ::new (reinterpret_cast(&node->value)) VALUE(value); // Requires accessible copy constructor + } + else + { + assert(false && "key already present"); + } + } + + void insert(const KEY &key, VALUE &&value) + { + int index = table_index_of(key); + Node *node = find_node(index, key); + if ( node == nullptr ) + { + node = allocate_node(index, key); + ::new (reinterpret_cast(&node->value)) VALUE(std::move(value)); // Requires accessible move + // constructor + } + else + { + assert(false && "key already present"); + } + } + + // reinsert(key, value) + // - If key is not already present in the map: + // - Is equivalent to insert + // - Appends a new node for key at the end of the iteration order and copies or moves value into it + // - If the map is at capacity then resizing will occur + // - If resizing occurs, all iterators and references are invalidated. Otherwise, they are unaffected. 
+ // - If key is already present in the map: + // - Moves the node at key to the back of the iteration order + // - Replaces the VALUE object at key with value + // - No hash chains are modified + // - All iterators remain valid, but iterators to the reinserted node will point to its new position + + void reinsert(const KEY &key, const VALUE &value) + { + int index = table_index_of(key); + Node *node = find_node(index, key); + if ( node == nullptr ) + { + node = allocate_node(index, key); + ::new (reinterpret_cast(&node->value)) VALUE(value); // Requires accessible copy constructor + } + else // Reinsert re-links at end of ordering chain + { + if ( node->order_next != NODE_UNLINKED ) // No need to relink if already at end of chain + { + index = int(node - _items.get()); // Get actual node index + unlink_node_order(node); + + // relink node at end of chain + _items[_orderLast].order_next = INDEXER(index); + node->order_prev = _orderLast; + node->order_next = NODE_UNLINKED; + _orderLast = INDEXER(index); + } + + // Re-assign value (last) + node->Value() = value; + } + } + + void reinsert(const KEY &key, VALUE &&value) + { + int index = table_index_of(key); + Node *node = find_node(index, key); + if ( node == nullptr ) + { + node = allocate_node(index, key); + ::new (reinterpret_cast(&node->value)) VALUE(std::move(value)); // Requires accessible move + // constructor + } + else // Reinsert re-links at end of ordering chain + { + if ( node->order_next != NODE_UNLINKED ) // No need to relink if already at end of chain + { + index = int(node - _items.get()); // Get actual node index + unlink_node_order(node); + + // relink node at end of chain + _items[_orderLast].order_next = INDEXER(index); + node->order_prev = _orderLast; + node->order_next = NODE_UNLINKED; + _orderLast = INDEXER(index); + } + + // Re-assign value (last) + node->Value() = std::move(value); + } + } + + // replace(oldKey, newKey, value) + // - Replaces the node at oldKey with a new key and value but 
same position in iteration order + // - The new key must not already be present in the map + // - All iterators and references are invalidated + // - If the map is at its capacity, resizing will occur + + void replace(const KEY &oldKey, const KEY &newKey, const VALUE &value) + { + insert(newKey, value); + swap_order(find(oldKey), find(newKey)); + erase(oldKey); + } + + // operator[] + // - Analogous to unordered_map::operator[] + // - If insertion occurs, the new node is appended to the end of the iteration order + // - If insertion occurs and the map is at capacity then resizing will occur + // - If resizing occurs, all iterators and references are invalidated. Otherwise, they are unaffected. + + VALUE &operator[](const KEY &key) + { + int index = table_index_of(key); + Node *node = find_node(index, key); + if ( node == nullptr ) + { + node = allocate_node(index, key); + ::new (reinterpret_cast(&node->value)) VALUE(); + } + return node->Value(); + } + + const VALUE &operator[](const KEY &key) const + { + int index = table_index_of(key); + const Node *node = find_node(index, key); + if ( node == nullptr ) throw std::out_of_range("missing key"); + return node->Value(); + } + + const VALUE &at(const KEY &key) const + { + int index = table_index_of(key); + const Node *node = find_node(index, key); + if ( node == nullptr ) throw std::out_of_range("missing key"); + return node->Value(); + } + + VALUE &at(const KEY &key) { return const_cast(const_cast(this)->at(key)); } + + bool try_get(const KEY &key, VALUE &value) const + { + int index = table_index_of(key); + const Node *node = find_node(index, key); + if ( node != nullptr ) + { + value = node->Value(); + return true; + } + return false; + } + + const VALUE *try_ref(const KEY &key) const + { + int index = table_index_of(key); + const Node *node = find_node(index, key); + if ( node != nullptr ) + { + return &node->Value(); + } + return nullptr; + } + + VALUE *try_ref(const KEY &key) { return 
const_cast(const_cast(this)->try_ref(key)); } + + const VALUE &front() const + { + if ( empty() ) throw std::out_of_range("no keys"); + return _items[_orderBegin].Value(); + } + + const VALUE &back() const + { + if ( empty() ) throw std::out_of_range("no keys"); + return _items[_orderLast].Value(); + } + + // Return all keys (in insertion order) + std::vector keys() const + { + std::vector tmp; + tmp.reserve(_itemCount); + // Collect the keys in order + int order = _orderBegin; + for ( int i = 0; i < _itemCount; i++ ) + { + const Node &item = _items[order]; + assert(item.hash_next != HASH_FREE); + tmp.push_back(item.key); + order = item.order_next; + } + assert(order == NODE_UNLINKED); + return tmp; + } + + template + class iterator_base + { + friend this_class_t; + using value_ref_t = typename std::conditional_t; + using value_ptr_t = typename std::conditional_t; + using node_t = typename std::conditional_t; + using iterator_base_t = iterator_base; + + protected: + node_t _items; + int _at; + + public: + iterator_base() = default; + iterator_base(const iterator_base_t &other) { *this = other; } + iterator_base(node_t items, int start) : _items(items), _at(start) {} + + // Value-only access + template = 0> + value_ref_t operator*() const + { + return _items[_at].Value(); + } + + template = 0> + value_ptr_t operator->() + { + return &_items[_at].Value(); + } + + // Pair access + template = 0> + const KEY &key() const + { + return _items[_at].key; + } + template = 0> + value_ref_t value() const + { + return _items[_at].Value(); + } + + template = 0> + std::pair operator*() const + { + return std::pair(key(), value()); + } + + iterator_base_t &operator++() + { + if constexpr ( REVERSE ) _at = _items[_at].order_prev; + else _at = _items[_at].order_next; + return *this; + } + iterator_base_t operator++(int) + { + int tmp = _at; + if constexpr ( REVERSE ) _at = _items[_at].order_prev; + else _at = _items[_at].order_next; + return iterator_base_t(_items, tmp); + } + + 
iterator_base_t &operator--() + { + if constexpr ( REVERSE ) _at = _items[_at].order_next; + else _at = _items[_at].order_prev; + return *this; + } + iterator_base_t operator--(int) + { + int tmp = _at; + if constexpr ( REVERSE ) _at = _items[_at].order_next; + else _at = _items[_at].order_prev; + return iterator_base_t(_items, tmp); + } + + iterator_base_t &operator=(const iterator_base_t &other) + { + _items = other._items; + _at = other._at; + return *this; + } + + bool operator==(const iterator_base &b) const + { + assert(_items == b._items); + return _at == b._at; + } + + bool operator==(const iterator_base &b) const + { + assert(_items == b._items); + return _at == b._at; + } + + bool operator!=(const iterator_base &b) const + { + assert(_items == b._items); + return _at != b._at; + } + + bool operator!=(const iterator_base &b) const + { + assert(_items == b._items); + return _at != b._at; + } + + protected: + Node *node() const { return const_cast(&_items[_at]); } + }; + + using iterator = iterator_base; + using reverse_iterator = iterator_base; + using pair_iterator = iterator_base; + using const_iterator = iterator_base; + using const_reverse_iterator = iterator_base; + using const_pair_iterator = iterator_base; + + // Forward value iterators + iterator begin() { return iterator(_items.get(), _orderBegin); } + iterator end() { return iterator(_items.get(), NODE_UNLINKED); } + const_iterator begin() const { return const_iterator(_items.get(), _orderBegin); } + const_iterator end() const { return const_iterator(_items.get(), NODE_UNLINKED); } + + // Reverse value iterators + reverse_iterator rbegin() { return reverse_iterator(_items.get(), _orderLast); } + reverse_iterator rend() { return reverse_iterator(_items.get(), NODE_UNLINKED); } + const_reverse_iterator rbegin() const { return const_reverse_iterator(_items.get(), _orderLast); } + const_reverse_iterator rend() const { return const_reverse_iterator(_items.get(), NODE_UNLINKED); } + + template + class 
iterator_proxy + { + private: + using outer_type_t = typename std::conditional_t; + outer_type_t _outer; + + public: + iterator_proxy(outer_type_t outer) : _outer(outer) {} + pair_iterator begin() { return pair_iterator(_outer._items.get(), _outer._orderBegin); } + pair_iterator end() { return pair_iterator(_outer._items.get(), NODE_UNLINKED); } + const_pair_iterator begin() const { return const_pair_iterator(_outer._items.get(), _outer._orderBegin); } + const_pair_iterator end() const { return const_pair_iterator(_outer._items.get(), NODE_UNLINKED); } + }; + + iterator_proxy pairs() { return iterator_proxy(*this); } + iterator_proxy pairs() const { return iterator_proxy(*this); } + + iterator find(const KEY &key) + { + int index = table_index_of(key); + const Node *node = find_node(index, key); + if ( node == nullptr ) + { + return end(); + } + return iterator(_items.get(), int(node - _items.get())); + } + + const_iterator find(const KEY &key) const + { + int index = table_index_of(key); + const Node *node = find_node(index, key); + if ( node == nullptr ) + { + return end(); + } + return const_iterator(_items.get(), int(node - _items.get())); + } + + // reorder_after(key, position) and reorder_before(key, position) + // - Moves a node to a new position in iteration order + // - reorder_after() moves the node to immediately after position in iteration order + // - reorder_before() moves the node to immediately before position in iteration order + // - Key must already be present in the map + // - end() can be used as the position for reorder_before() but not reorder_after() + // - No hash chains are modified + // - No VALUE objects are moved, copied, created or destroyed + // - Container size is not changed and no reallocation occurs + // - All iterators remain valid, but iterators to the reordered node will point to its new position + + void reorder_after(const KEY &key, iterator position) + { + int index = table_index_of(key); + Node *node = find_node(index, 
key); + reorder_node(node, position.node(), true); + } + + void reorder_before(const KEY &key, iterator position) + { + int index = table_index_of(key); + Node *node = find_node(index, key); + if ( position == end() ) + { + reorder_node(node, &_items[_orderLast], true); + } + else + { + reorder_node(node, position.node(), false); + } + } + + // erase(const KEY &key) + // - Analogous to std::unordered_map::erase() + // - Removes key from map, if present, and returns the number of elements removed + // - Unlike unordered_map::erase(), all iterators and references are invalidated (not just the removed element) + + int erase(const KEY &key) + { + int index = table_index_of(key); + int prevIndex = -1; // local sentinel - not the indexer value + Node *node = find_node(index, key, &prevIndex); + if ( node != nullptr ) + { + deallocate_node(node, prevIndex); + return 1; + } + return 0; + } + + // erase(iterator pos) + // - Analogous to std::vector::erase() + // - Removes the node at pos and returns an iterator to the next node in iteration order + // - Unlike vector::erase(), all iterators and references are invalidated (not just the removed element onwards) + + iterator erase(iterator pos) + { + assert(pos._at != NODE_UNLINKED); + const auto &key = _items[pos._at].key; + int index = table_index_of(key); // Locate in hash chain (iterators walk the ordering chain) + int prevIndex = -1; // local sentinel - not the indexer value + int nextOrder = _items[pos._at].order_next; + Node *node = find_node(index, key, &prevIndex); + assert(node != nullptr); + deallocate_node(node, prevIndex); + return iterator(_items.get(), nextOrder); + } + + // swap_order() + // - Swaps the iteration order of two nodes specified by iterator + // - Both iterators must be dereferenceable + // - No hash chains are modified + // - No VALUE objects are moved, copied, created or destroyed + // - Container size is not changed and no reallocation occurs + // - All iterators remain valid, but iterators to 
either of the two swapped nodes will point to their new position + + void swap_order(iterator first, iterator second) + { + assert(first != end() && second != end()); + swap_node_order(first.node(), second.node()); + } + + bool key_of(const VALUE &value, KEY &key) const + { + int order = _orderBegin; + while ( order != NODE_UNLINKED ) + { + auto *node = &_items[order]; + if ( node->Value() == value ) + { + key = node->key; + return true; + } + order = node->order_next; + } + return false; + } + +private: + ordered_map(this_class_t &other, int capacity) + { + assert(capacity < std::numeric_limits::max() - 2); // Capacity exceeds indexer + + resize(capacity); + + // Duplicate in source's insertion order + int order = other._orderBegin; + while ( order != NODE_UNLINKED ) + { + auto *from = &other._items[order]; + int index = table_index_of(from->key); + auto *to = allocate_node(index, from->key); + copy_move_helper()(to->key, from->key); + ::new (reinterpret_cast(&to->value)) VALUE(); + copy_move_helper()(to->Value(), from->Value()); + order = from->order_next; + } + } + + void resize(int capacity) + { + int newTableSize = hashtable_size(capacity); + assert(capacity < std::numeric_limits::max() - 2); // Capacity exceeds indexer + if ( capacity <= _capacity ) return; + + // Same hash table size, just move old items into the new item storage + if ( !_items || _tableSize == newTableSize ) + { + std::unique_ptr newItems = std::make_unique(capacity); + if ( _items ) + { + // Probably resizing because we are full, so move everything + for ( int i = 0; i < _capacity; i++ ) + { + auto *from = &_items[i]; + auto *to = &newItems[i]; + to->copy_links(*from); + copy_move_helper()(to->key, from->key); + ::new (reinterpret_cast(&to->value)) VALUE(); + copy_move_helper()(to->Value(), from->Value()); + } + } + _items = std::move(newItems); + _capacity = capacity; + _tableSize = INDEXER(newTableSize); + _allocWatermark = INDEXER(capacity - 1); + } + else // Rehash by stealing the 
internals of another map + { + // This may occur recursively if the hash function is poor + this_class_t temp(*this, capacity); + *this = std::move(temp); + } + assert(_tableSize != 0); + } + + Node *allocate_node(int index, const KEY &key) + { + // This function must only be called when an allocation is required + assert(index >= 0 && index < _tableSize); + // Try to insert into the hash table, if given index isn't free then find a free node and link onto that + while ( _items[index].hash_next != HASH_FREE ) + { + int prev = index; + index = find_free_index(); + if ( index < 0 ) + { + // Conservative resize strategy + resize(_capacity + (_capacity + 1) / 2); + index = table_index_of(key); + continue; + } + + // Find the end of the hash chain + while ( _items[prev].hash_next != HASH_END ) + { + prev = _items[prev].hash_next; + } + + _items[prev].hash_next = INDEXER(index); + } + + Node *node = &_items[index]; + node->hash_next = HASH_END; // this node is now used but not linked + node->key = key; + if ( _orderBegin == NODE_UNLINKED ) + { + _orderBegin = _orderLast = INDEXER(index); + node->order_next = node->order_prev = NODE_UNLINKED; + } + else + { + _items[_orderLast].order_next = INDEXER(index); + node->order_next = NODE_UNLINKED; + node->order_prev = _orderLast; + _orderLast = INDEXER(index); + } + + _itemCount++; + return node; + } + + void deallocate_node(Node *node, int prevIndex) + { + // Remove from ordering chain FIRST: + unlink_node_order(node); + + // Remove from hash chain: Just unlink node if not in initial bucket + if ( prevIndex != -1 ) + { + node->Value().~VALUE(); + _items[prevIndex].hash_next = node->hash_next; + node->hash_next = HASH_FREE; + } + else if ( node->hash_next == HASH_END ) + { + node->Value().~VALUE(); + node->hash_next = HASH_FREE; + } + // If we are at the start of a chain then we need to move the next hashed node into this bucket (painful) + else + { + // If the hash table is used as overflow allocation (i.e. 
PURE_HASH_CHAINS == false) + // the next node in this chain may also be the start of another chain, which is not handled here. + assert(PURE_HASH_CHAINS); // TODO: Add support for deallocation when PURE_HASH_CHAINS==false + + Node *next = &_items[node->hash_next]; + copy_move_helper()(node->Value(), next->Value()); + node->key = next->key; + node->hash_next = next->hash_next; + next->hash_next = HASH_FREE; + + // Relink ordering for the moved node + node->order_next = next->order_next; + node->order_prev = next->order_prev; + update_node_order(node); + + next->order_next = NODE_UNLINKED; + next->order_prev = NODE_UNLINKED; + } + + _itemCount--; + } + + void unlink_node_order(Node *node) + { + // If this is not the first node, unlink from previous + if ( node->order_prev != NODE_UNLINKED ) + { + _items[node->order_prev].order_next = node->order_next; + } + else + { + _orderBegin = node->order_next; + } + + // If this is not the last node, unlink from next + if ( node->order_next != NODE_UNLINKED ) + { + _items[node->order_next].order_prev = node->order_prev; + } + else + { + _orderLast = node->order_prev; + } + + node->order_next = NODE_UNLINKED; + node->order_prev = NODE_UNLINKED; + } + + void update_node_order(Node *node) + { + const auto index = INDEXER(node - _items.get()); + if ( node->order_prev == NODE_UNLINKED ) + { + _orderBegin = index; + } + else + { + _items[node->order_prev].order_next = index; + } + if ( node->order_next == NODE_UNLINKED ) + { + _orderLast = index; + } + else + { + _items[node->order_next].order_prev = index; + } + } + + void reorder_node(Node *node, Node *position, bool after) + { + assert(node != nullptr); + assert(position != nullptr); + + if ( node != position ) + { + unlink_node_order(node); + + if ( after ) + { + node->order_next = position->order_next; + node->order_prev = INDEXER(position - _items.get()); + } + else + { + node->order_next = INDEXER(position - _items.get()); + node->order_prev = position->order_prev; + } + + 
update_node_order(node); + } + } + + void swap_node_order(Node *first, Node *second) + { + // Given the two nodes, swap the traversal orders + const INDEXER firstPrev = first->order_prev; + const INDEXER firstNext = first->order_next; + + first->order_prev = second->order_prev; + first->order_next = second->order_next; + + second->order_prev = firstPrev; + second->order_next = firstNext; + + // If the two nodes were adjacent before swapping, then they will each be pointing at themselves now. + // Point them at each other instead. + const auto firstIndex = INDEXER(first - _items.get()); + const auto secondIndex = INDEXER(second - _items.get()); + + if ( firstPrev == secondIndex ) + { + first->order_next = secondIndex; + second->order_prev = firstIndex; + } + else if ( firstNext == secondIndex ) + { + first->order_prev = secondIndex; + second->order_next = firstIndex; + } + + // Inform the affected neighbours of the change in order + update_node_order(first); + update_node_order(second); + } + + const Node *find_node(int index, const KEY &key, int *prev = nullptr) const + { + // Initial index must be within the hashtable + assert(index >= 0 && index < _tableSize); + + // Node is unallocated + if ( _items[index].hash_next == HASH_FREE ) + { + return nullptr; + } + + // Look for matching key + do + { + if ( _items[index].key == key ) + { + assert(_items[index].hash_next != HASH_FREE); // This node MUST BE allocated + return &_items[index]; + } + if ( prev ) + { + *prev = index; + } + index = _items[index].hash_next; + } while ( index != HASH_END ); + + return nullptr; + } + + Node *find_node(int index, const KEY &key, int *prev = nullptr) + { + return const_cast(const_cast(this)->find_node(index, key, prev)); + } + + template + struct copy_move_helper + { + void operator()(TYPE &dst, TYPE &src) const + { + dst = src; + src.~TYPE(); + } + }; + + template + struct copy_move_helper::value>::type> + { + void operator()(TYPE &dst, TYPE &src) const { dst = std::move(src); } 
+ }; + + template + // clang-format off + struct copy_move_helper::value && !(std::is_move_assignable::value || std::is_arithmetic::value || std::is_pointer::value)>::type > + // clang-format on + { + // coverity[dont_call:FALSE] + void operator()(TYPE &dst, TYPE &src) const { std::memcpy(&dst, &src, sizeof(TYPE)); } + }; + + int table_index_of(const KEY &key, int tableSize) const { return int(HASH()(key, size_t(tableSize))); } + int table_index_of(const KEY &key) const { return table_index_of(key, _tableSize); } + + int find_free_index() + { + const int minAlloc = PURE_HASH_CHAINS ? _tableSize : 0; + + // Crude search for a free slot (start at the watermark) + for ( int i = _allocWatermark; i >= minAlloc; --i ) + { + if ( _items[i].hash_next == HASH_FREE ) + { + _allocWatermark = INDEXER(i - 1); + return i; + } + } + for ( int i = _capacity - 1; i > _allocWatermark; --i ) + { + if ( _items[i].hash_next == HASH_FREE ) + { + _allocWatermark = INDEXER(i - 1); + return i; + } + } + return -1; // Error, no free slots + } + + int hashtable_size(int &capacity) + { + // Prime table is tuned for rehashing while the capacity is small. Whereas + // table sizes for larger capacities are sparser, when rehashing is expensive. + // Note: Not all starting capacities give the same resize pattern! 
+ // clang-format off + static constexpr int primes[] = { 3, 7, 11, 13, 17, 19, 23, 31, 41, 67, 97, 131, + 197, 257, 509, 1021, 2039, 4093, 8191, 16381 }; + // clang-format on + assert(capacity < std::numeric_limits::max() - 2); // Capacity exceeds indexer + capacity = std::max(capacity, 2); + + // Estimate to expect ~1 collision per element + int estimatedTableSize = (capacity + 1) / 2; + int tableSize = 2; + // Choose a conservative prime hashtable size for small capacities + // (stops rehashing after final table size has been seen) + for ( int i : primes ) + { + if ( i > estimatedTableSize ) + { + break; + } + tableSize = i; + } + + // Ensure capacity includes space other than just the table + capacity = std::max(capacity, tableSize * 2); + assert(tableSize != 0); + return tableSize; + } +}; + +} // namespace armstd + + +// WORKAROUND: Pull into regor namespace +namespace regor +{ +template, typename INDEXER = int16_t, bool PURE_HASH_CHAINS = true> +using ordered_map = armstd::ordered_map; + +} // namespace regor diff --git a/ethosu/regor/common/reverse_type.cpp b/ethosu/regor/common/reverse_type.cpp new file mode 100644 index 00000000..6525ccd6 --- /dev/null +++ b/ethosu/regor/common/reverse_type.cpp @@ -0,0 +1,30 @@ +// +// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#include "common/reverse_type.hpp" + +#include "common/logging.hpp" + +#include "common/bit_flags.hpp" + +BEGIN_ENUM_TABLE(ReverseType) + ADD_ENUM_NAME(None) + ADD_ENUM_NAME(H) + ADD_ENUM_NAME(W) + ADD_ENUM_NAME(C) +END_ENUM_TABLE() diff --git a/ethosu/regor/common/reverse_type.hpp b/ethosu/regor/common/reverse_type.hpp new file mode 100644 index 00000000..280d76aa --- /dev/null +++ b/ethosu/regor/common/reverse_type.hpp @@ -0,0 +1,28 @@ +// +// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once +#include "common/bit_flags.hpp" + +enum class ReverseType : uint8_t +{ + None = 0x0, + C = 0x1, + W = 0x2, + H = 0x4 +}; diff --git a/ethosu/regor/common/scaling.cpp b/ethosu/regor/common/scaling.cpp new file mode 100644 index 00000000..f9fdc664 --- /dev/null +++ b/ethosu/regor/common/scaling.cpp @@ -0,0 +1,93 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "scaling.hpp" + +#include "common/numeric_util.hpp" + +#include +#include + +bool QuantizedScale::operator==(const QuantizedScale &other) const +{ + return (scale == other.scale) && (shift == other.shift); +} + +bool QuantizedScale::operator!=(const QuantizedScale &other) const +{ + return !(*this == other); +} + +QuantizedScale::QuantizedScale(double scale_, bool reduced) +{ + int exponent = 0; + int leftShift = reduced ? 15 : 31; + double significand = std::frexp(scale_, &exponent); + // convert from left to right-shift + scale = int32_t(std::round(significand * double(1LL << leftShift))); + shift = leftShift - exponent; + // if shift is out of bounds [0,63], try to get back within bounds + if ( shift > 63 && scale > std::exp2(shift - 63) ) + { + scale = scale >> (shift - 63); + shift = 63; + } + else if ( shift < 0 && scale < std::exp2(shift + 32) ) + { + scale = scale << (0 - shift); + shift = 0; + } +} + +double QuantizedScale::Dequantize() const +{ + double significand = double(scale); + // ldexp expects a left-shift + // so we convert from right to left-shift + int exp = -shift; + return std::ldexp(significand, exp); +} + +QuantizedScale ElementwiseMulScale(const double inputScale, const double input2Scale, const double outputScale) +{ + // clamp to single-point precision + float ifm1Scale = ClampToType(inputScale); + float ifm2Scale = ClampToType(input2Scale); + float outScale = ClampToType(outputScale); + + float outputRescale = (ifm1Scale * ifm2Scale) / outScale; + return QuantizedScale(outputRescale); +} + +// Convert int32_t 
multiplier to int16_t with rounding. +int16_t DownScaleInt32ToInt16Multiplier(int32_t multiplier) +{ + static constexpr int32_t kRoundingOffset = 1 << 15; + int16_t mul16; + + if ( multiplier >= std::numeric_limits::max() - kRoundingOffset ) + { + mul16 = std::numeric_limits::max(); + } + else + { + mul16 = int16_t((multiplier + kRoundingOffset) >> 16); + } + + return mul16; +} diff --git a/ethosu/regor/common/scaling.hpp b/ethosu/regor/common/scaling.hpp new file mode 100644 index 00000000..dad14714 --- /dev/null +++ b/ethosu/regor/common/scaling.hpp @@ -0,0 +1,49 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2023 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#pragma once +#include + +class QuantizedScale +{ +public: + int32_t scale; + int shift; + +public: + QuantizedScale() = default; + QuantizedScale(int32_t scale_, int shift_) : scale(scale_), shift(shift_) {} + /** + * Creates a quantized scale and shift from a floating-point + * scale: floating-point representation of the scale + * reduced: reduces the quantization to 16-bit (default 32) + */ + QuantizedScale(double scale_, bool reduced = false); + /* + * Dequantizes scale into floating-point + */ + double Dequantize() const; + bool operator==(const QuantizedScale &other) const; + bool operator!=(const QuantizedScale &other) const; +}; + +/* Calculate elementwise Mul OFM QuantizedScale */ +QuantizedScale ElementwiseMulScale(double inputScale, double input2Scale, double outputScale); + +/* Convert int32_t multiplier to int16_t with rounding. */ +int16_t DownScaleInt32ToInt16Multiplier(int32_t mul); diff --git a/ethosu/regor/common/shape.hpp b/ethosu/regor/common/shape.hpp new file mode 100644 index 00000000..60e673aa --- /dev/null +++ b/ethosu/regor/common/shape.hpp @@ -0,0 +1,733 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#pragma once + +#include "common.hpp" +#include "numeric_util.hpp" +#include "transpose_type.hpp" + +#include +#include +#include +#include +#include +#include + +/// +/// Multi-axis shape description (stores axis dimensions backwards) +/// +class Shape +{ +private: + static constexpr int MAX_STATIC_AXES = 4; + union + { + int32_t axes[MAX_STATIC_AXES]; + int32_t *ptr; + } _storage; + int8_t _last = -1; // Invalid + bool _dynamic = false; + +public: + Shape() {} + + Shape(int c) + { + Init(1); + At(0) = c; + } + + Shape(int w, int c) + { + Init(2); + At(0) = c; + At(1) = w; + } + + Shape(int h, int w, int c) + { + Init(3); + At(0) = c; + At(1) = w; + At(2) = h; + } + + Shape(int n, int h, int w, int c) + { + Init(4); + At(0) = c; + At(1) = w; + At(2) = h; + At(3) = n; + } + + template + Shape(const TYPE *axes, size_t length) + { + assert(length < size_t(std::numeric_limits::max())); + Init(int(length)); + if ( axes != nullptr ) + { + auto *local = Storage(); + // Reverses input into position + assert(size_t(_last) == length - 1); + for ( size_t i = 0; i < length; i++ ) + { + local[_last - i] = int32_t(axes[i]); + } + } + } + + Shape(std::nullptr_t, int length, int fillValue = 0) { Init(length, fillValue); } + + Shape(const Shape &other) + { + if ( other.IsValid() ) + { + Init(other.Size()); + std::copy_n(other.Storage(), Size(), Storage()); + } + } + + explicit Shape(Shape &&other) + { + _storage = other._storage; + _dynamic = other._dynamic; + _last = other._last; + other._storage.ptr = nullptr; + other._dynamic = false; + } + + Shape(const Shape &other, int length, int padValue = 0) + { + if ( other.IsValid() ) + { + Init(length, padValue); + std::copy_n(other.Storage(), std::min(other.Size(), length), Storage()); + } + } + + ~Shape() { Free(); } + +public: + int &operator[](int index) + { + int offset = ToOffset(index); + assert((offset >= 0) && (offset <= _last)); + return At(offset); + } + + int operator[](int index) const + { + int offset = 
ToOffset(index); + assert((offset >= 0) && (offset <= _last)); + return At(offset); + } + + Shape &operator=(const Shape &other) + { + if ( &other != this ) + { + Free(); + if ( other.IsValid() ) + { + Init(other.Size()); + std::copy_n(other.Storage(), Size(), Storage()); + } + } + return *this; + } + + Shape &operator=(Shape &&other) + { + if ( &other != this ) + { + Free(); + _storage = other._storage; + _dynamic = other._dynamic; + _last = other._last; + other._storage.ptr = nullptr; + other._dynamic = false; + other._last = -1; + } + return *this; + } + + bool operator==(const Shape &other) const + { + if ( other._last != _last ) + { + return false; + } + + auto *from = other.Storage(); + auto *local = Storage(); + for ( int i = 0; i <= _last; i++ ) + { + if ( local[i] != from[i] ) + { + return false; + } + } + return true; + } + + explicit operator uint32_t() const { return At(0) ^ (At(1) << 8) ^ (At(2) << 16) ^ (At(3) << 24); } + + explicit operator uint64_t() const { return uint64_t(uint32_t(*this)); } + + explicit operator bool() const { return IsValid(); } + + bool operator!=(const Shape &other) const { return !((*this) == other); } + + // Required for use in maps/sets + bool operator<(const Shape &other) const + { + auto *from = other.Storage(); + auto *local = Storage(); + + for ( int i = std::min(other._last, _last); i >= 0; i-- ) + { + if ( local[i] < from[i] ) + { + return true; + } + else if ( local[i] > from[i] ) + { + return false; + } + } + + return false; + } + + bool operator<=(const Shape &other) const { return *this < other || *this == other; } + + Shape operator+(const Shape &other) const { return Shape::MaxFunc, false, 0>(*this, other); } + + Shape operator-(const Shape &other) const { return Shape::MaxFunc, false, 0>(*this, other); } + + Shape operator%(const Shape &other) const { return Shape::MaxFunc, false, 1>(*this, other); } + + Shape operator/(const Shape &other) const { return Shape::MaxFunc, false, 1>(*this, other); } + + Shape 
operator*(int scale) const { return Shape::ScalarFunc>(*this, scale); } + + Shape operator/(int scale) const { return Shape::ScalarFunc>(*this, scale); } + + Shape &operator+=(const Shape &other) + { + Shape tmp = *this + other; + *this = std::move(tmp); + return *this; + } + + Shape &operator-=(const Shape &other) + { + Shape tmp = *this - other; + *this = std::move(tmp); + return *this; + } + + int Dot(const Shape &other) const + { + int result = 0; + if ( VERIFY(other.Size() == Size()) ) + { + auto *from = other.Storage(); + auto *local = Storage(); + for ( int i = 0; i <= _last; i++ ) + { + result += local[i] * from[i]; + } + } + return result; + } + + Shape With(int index, int value) const + { + Shape tmp(*this, std::max(Size(), index + 1)); + tmp.At(ToOffset(index)) = value; + return tmp; + } + + Shape WithBatch(int n) const + { + Shape tmp(*this, std::max(Size(), 4)); + tmp.At(3) = n; + return tmp; + } + + Shape WithHeight(int h) const + { + Shape tmp(*this, std::max(Size(), 3)); + tmp.At(2) = h; + return tmp; + } + + Shape WithWidth(int w) const + { + Shape tmp(*this, std::max(Size(), 2)); + tmp.At(1) = w; + return tmp; + } + + Shape WithDepth(int d) const + { + Shape tmp(*this, std::max(Size(), 1)); + tmp.At(0) = d; + return tmp; + } + + Shape WithHW(int h, int w) const + { + Shape tmp(*this, std::max(Size(), 3)); + tmp.At(2) = h; + tmp.At(1) = w; + return tmp; + } + + Shape WithZeros() const { return Shape(nullptr, Size()); } + + Shape WithOnes() const + { + Shape tmp(nullptr, Size()); + std::fill_n(tmp.Storage(), Size(), 1); + return tmp; + } + + Shape Insert(int index, int value) const + { + Shape tmp(nullptr, Size() + 1); + auto *result = tmp.Storage(); + auto *local = Storage(); + + index = Size() - index; + for ( int i = 0; i < index; i++ ) + result[i] = local[i]; + result[index] = value; + for ( int i = index; i <= _last; i++ ) + result[i + 1] = local[i]; + + return tmp; + } + + Shape Erase(int index) const + { + Shape tmp(nullptr, Size() - 1); + 
auto *result = tmp.Storage(); + auto *local = Storage(); + + index = ToOffset(index); + for ( int i = 0; i < index; i++ ) + result[i] = local[i]; + for ( int i = index + 1; i <= _last; i++ ) + result[i - 1] = local[i]; + + return tmp; + } + + Shape Extract(int a, int b, int c) const { return Extract({a, b, c}); } + + Shape Extract(int a, int b, int c, int d) const { return Extract({a, b, c, d}); } + + Shape Extract(std::initializer_list axes) const + { + Shape tmp(nullptr, int(axes.size())); + auto *local = Storage(); + auto *result = tmp.Storage() + tmp.Size() - 1; + for ( auto axis : axes ) + { + int from = ToOffset(axis); + assert(from < Size()); + *result-- = local[from]; + } + return tmp; + } + + Shape Extract(const Shape &axes) const + { + Shape tmp(nullptr, axes.Size()); + auto *local = Storage(); + auto *result = tmp.Storage() + tmp.Size() - 1; + for ( int i = 0; i < axes.Size(); i++ ) + { + int from = ToOffset(axes[i]); + assert(from < Size()); + *result-- = local[from]; + } + return tmp; + } + + // Permute using 4-bit-per-axis mask with depth in the LSB + Shape Permute(uint32_t reverseAxisMask4b) const + { + int length = Size(); + if ( length == 0 ) return *this; + + Shape tmp(nullptr, length); + auto *local = Storage(); + auto *result = tmp.Storage(); + + while ( length-- ) + { + int from = ToOffset(reverseAxisMask4b & 0xF); + assert(from < Size()); + *result++ = local[from]; + reverseAxisMask4b = reverseAxisMask4b >> 4; + } + assert((tmp.Elements64() == Elements64()) && "Possible bad permute (volume differs)"); + return tmp; + } + + // Reverse permute using 4-bit-per-axis mask with depth in the LSB + Shape Unpermute(uint32_t reverseAxisMask4b) const + { + int length = Size(); + if ( length == 0 ) return *this; + + Shape tmp(nullptr, length); + auto *local = Storage(); + auto *result = tmp.Storage(); + + while ( length-- ) + { + int to = ToOffset(reverseAxisMask4b & 0xF); + assert(to < Size()); + result[to] = *local++; + reverseAxisMask4b = 
reverseAxisMask4b >> 4; + } + assert((tmp.Elements64() == Elements64()) && "Possible bad unpermute (volume differs)"); + return tmp; + } + + Shape Untranspose(TransposeType type) const + { + if ( IsNone(type) ) return *this; + + return Unpermute(uint32_t(type)); + } + + int Size() const { return _last + 1; } + + int Depth() const + { + assert(_last >= 0); + return At(0); + } + int Width() const + { + assert(_last >= 1); + return At(1); + } + int Height() const + { + assert(_last >= 2); + return At(2); + } + int Batch() const + { + assert(_last >= 3); + return At(3); + } + + template + Point2 WC() const + { + assert(Size() >= 2); + return Point2(TYPE(Width()), TYPE(Depth())); + } + + template + Point2 WH() const + { + assert(Size() >= 3); + return Point2(TYPE(Width()), TYPE(Height())); + } + + template + Point3 HWC() const + { + assert(Size() >= 3); + return Point3(TYPE(Height()), TYPE(Width()), TYPE(Depth())); + } + + int ElementsWH() const + { + int64_t result = int64_t(Width()) * Height(); + assert(result <= std::numeric_limits::max()); + return int(result); + } + + int Elements() const + { + int64_t result = Elements64(); + assert(result <= std::numeric_limits::max()); + return int(result); + } + + int64_t Elements64() const + { + int64_t result = 0; + if ( IsValid() ) + { + auto *local = Storage(); + result = local[0]; + for ( int i = 1; i <= _last; i++ ) + { + result *= local[i]; + } + } + return result; + } + + bool IsValid() const { return _last >= 0; } + + bool IsDynamic() const { return _dynamic; } + + bool IsEmpty() const + { + auto *local = Storage(); + for ( int i = 0; i <= _last; i++ ) + { + if ( local != 0 ) + { + return false; + } + } + return true; + } + + bool IsSubShapeOf(const Shape &other) const + { + if ( Size() > other.Size() ) + { + return false; + } + auto *bounds = other.Storage(); + auto *local = Storage(); + for ( int i = _last; i >= 0; i-- ) + { + if ( local[i] > bounds[i] ) + { + return false; + } + } + return true; + } + + template + 
int ToNHWC(TYPE *buffer, int length) const + { + auto *local = Storage() + _last; + for ( int i = 0; i < length; i++ ) + { + *buffer++ = TYPE(local[-i]); + } + return length; + } + + template + std::vector ToList() const + { + return std::vector(std::reverse_iterator(Storage() + Size()), std::reverse_iterator(Storage())); + } + + std::string ToString() const + { + std::string tmp; + tmp.reserve(16); + auto *local = Storage(); + for ( int i = _last; i >= 0; i-- ) + { + tmp += std::to_string(local[i]); + if ( i > 0 ) + { + tmp += ", "; + } + } + return tmp; + } + +private: + void Init(int size, int fillValue = 0) + { + assert(size > 0); + assert(size <= 127); + _last = size - 1; + _dynamic = (size > MAX_STATIC_AXES); + int32_t *p = _dynamic ? (_storage.ptr = new int32_t[size]) : _storage.axes; + std::fill_n(p, size, fillValue); + } + + void Free() + { + if ( _dynamic ) delete[] _storage.ptr; + _last = -1; // Becomes invalid + _dynamic = false; + _storage.ptr = nullptr; + } + + int32_t &At(int index) { return Storage()[index]; } + + const int32_t &At(int index) const { return Storage()[index]; } + + int32_t *Storage() { return _dynamic ? _storage.ptr : _storage.axes; } + + const int32_t *Storage() const { return _dynamic ? _storage.ptr : _storage.axes; } + + int ToOffset(int index) const { return (index < 0) ? (-index - 1) : (_last - index); } + + // Apply a function to the minimum number of axes between two shapes. + template + static Shape MinFunc(const Shape &a, const Shape &b) + { + int size = std::min(a.Size(), b.Size()); + Shape tmp(nullptr, size); + auto *pa = a.Storage(); + auto *pb = b.Storage(); + auto *result = tmp.Storage(); + for ( int i = 0; i < size; i++ ) + { + result[i] = FUNC()(pa[i], pb[i]); + } + return tmp; + } + + // Apply a function to the maximum number of axes between two shapes. For missing + // axes either take from the longest shape, or substitute a constant value. 
+ template + static Shape MaxFunc(const Shape &a, const Shape &b) + { + bool a_longer = a.Size() >= b.Size(); + int length = a_longer ? a.Size() : b.Size(); + int shortest = a_longer ? b.Size() : a.Size(); + + Shape tmp(nullptr, length); + auto *pa = a.Storage(); + auto *pb = b.Storage(); + auto *result = tmp.Storage(); + + int i = 0; + for ( ; i < shortest; i++ ) + { + result[i] = FUNC()(pa[i], pb[i]); + } + for ( ; i < length; i++ ) + { + if ( TAKE_LONGEST ) + { + result[i] = a_longer ? pa[i] : pb[i]; + } + else + { + result[i] = a_longer ? FUNC()(pa[i], MISSING_VALUE) : FUNC()(MISSING_VALUE, pb[i]); + } + } + return tmp; + } + + // Apply a scalar function to all axes + template + static Shape ScalarFunc(const Shape &a, int value) + { + Shape tmp(nullptr, a.Size()); + auto *pa = a.Storage(); + auto *result = tmp.Storage(); + for ( int i = 0; i < a.Size(); i++ ) + { + result[i] = FUNC()(pa[i], value); + } + return tmp; + } + + // Proxy for pointer-to-functions + template + struct func_proxy + { + std::remove_reference_t operator()(T a, T b) const { return FUNC(a, b); } + }; + + template + struct op_wrap + { + T operator()(const T a, const T b) const + { + assert(b > 0); + return (a >= b) ? (a % b) : a; + } + }; + +public: + template + static Shape FromVector(const std::vector &from) + { + return from.empty() ? 
Shape() : Shape(from.data(), from.size()); + } + + static Shape PadAxes(const Shape &shape, int axes, int padValue) + { + return Shape(shape, std::max(axes, shape.Size()), padValue); + } + + static Shape Min(const Shape &a, const Shape &b) + { + return Shape::MinFunc>>(a, b); + } + + static Shape Max(const Shape &a, const Shape &b) + { + return Shape::MaxFunc>, true>(a, b); + } + + static Shape RoundAway(const Shape &a, const Shape &b) + { + return Shape::MinFunc>>(a, b); + } + + static Shape RoundZero(const Shape &a, const Shape &b) + { + return Shape::MinFunc>>(a, b); + } + + static Shape DivRoundUp(const Shape &a, const Shape &b) + { + return Shape::MinFunc>>(a, b); + } + + static Shape Wrap(const Shape &a, const Shape &b) { return Shape::MinFunc>(a, b); } + + static Shape GetStridesForShape(const Shape &shape, const Shape &granularity) + { + assert(granularity.Size() >= shape.Size()); + Shape tmp(nullptr, shape.Size()); + if ( shape.IsValid() ) + { + auto *gran = granularity.Storage(); + auto *from = shape.Storage(); + auto *result = tmp.Storage(); + result[0] = gran[0]; + for ( int i = 1; i <= shape._last; i++ ) + { + result[i] = ::RoundAway(result[i - 1] * from[i - 1], gran[i]); + } + } + return tmp; + } +}; diff --git a/ethosu/regor/common/transpose_type.cpp b/ethosu/regor/common/transpose_type.cpp new file mode 100644 index 00000000..2d7671a0 --- /dev/null +++ b/ethosu/regor/common/transpose_type.cpp @@ -0,0 +1,33 @@ +// +// SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "common/transpose_type.hpp" + +#include "common/logging.hpp" + +#include "common/bit_flags.hpp" + +BEGIN_ENUM_TABLE(TransposeType) + ADD_ENUM_NAME(NHWC) + ADD_ENUM_NAME(NWHC) + ADD_ENUM_NAME(NHCW) + ADD_ENUM_NAME(NWCH) + ADD_ENUM_NAME(NCHW) + ADD_ENUM_NAME(NCWH) + ADD_ENUM_NAME(None) +END_ENUM_TABLE() diff --git a/ethosu/regor/common/transpose_type.hpp b/ethosu/regor/common/transpose_type.hpp new file mode 100644 index 00000000..58e4a3b1 --- /dev/null +++ b/ethosu/regor/common/transpose_type.hpp @@ -0,0 +1,78 @@ +// +// SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#pragma once +#include "common/bit_flags.hpp" + +enum class TransposeType : uint32_t +{ + NHWC = 0x0123, + NWHC = 0x0213, + NHCW = 0x0132, + NWCH = 0x0231, + NCHW = 0x0312, + NCWH = 0x0321, + None = 0x01234567, +}; + +inline constexpr TransposeType operator>>(TransposeType type, uint32_t size) +{ + return TransposeType(uint32_t(type) >> size); +} + +inline bool IsNone(TransposeType type) +{ + for ( int p = 0; p < 32; p += 4 ) + { + if ( type == (TransposeType::None >> p) ) return true; + } + return false; +} + +// Reduce a 4D transpose mask to a 3D transpose mask (f.ex. 0x0123 -> 0x012) +inline TransposeType Reduce4To3(TransposeType type) +{ + if ( IsNone(type) ) + { + return TransposeType(0x012); + } + + switch ( type ) + { + case TransposeType::NHWC: + case TransposeType::NWHC: + case TransposeType::NHCW: + case TransposeType::NWCH: + case TransposeType::NCHW: + case TransposeType::NCWH: + { + int n = uint32_t(type >> 12) & 0xF; + assert(n == 0); + int h = uint32_t(type >> 8) & 0xF; + assert(h <= 3); + int w = uint32_t(type >> 4) & 0xF; + assert(w <= 3); + int c = uint32_t(type >> 0) & 0xF; + assert(c <= 3); + return TransposeType(((h - 1) << 8) | ((w - 1) << 4) | (c - 1)); + } + default: + assert(false && "Unsupported transpose type"); + return type; + } +} diff --git a/ethosu/regor/common/vector_span.hpp b/ethosu/regor/common/vector_span.hpp new file mode 100644 index 00000000..9c63a5d5 --- /dev/null +++ b/ethosu/regor/common/vector_span.hpp @@ -0,0 +1,77 @@ +// +// SPDX-FileCopyrightText: Copyright 2021, 2023 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include + +/// +/// Mechanism for treating partial sections of a vector as complete sequences +/// to allow sub-array processing. +/// Does not track invalidation (source vector cannot change) +/// +template +class vector_span +{ + using iterator = typename std::vector::iterator; + using const_iterator = typename std::vector::const_iterator; + +private: + iterator _start; + iterator _end; + +public: + vector_span() = default; + + vector_span(std::vector &vec) + { + _start = vec.begin(); + _end = vec.end(); + } + + vector_span(std::vector &vec, int start, int end) + { + _start = vec.begin() + start; + _end = vec.begin() + end; + } + + vector_span(const std::vector &vec, int start, int end) + { + auto posStart = vec.begin() + start; + auto posEnd = vec.begin() + end; + // Use vec.erase to convert from const to non-const iterators (erases nothing!) 
+ _start = const_cast &>(vec).erase(posStart, posStart); + _end = const_cast &>(vec).erase(posEnd, posEnd); + } + + TYPE &front() { return *_start; } + const TYPE &front() const { return *_start; } + + TYPE &back() { return *(_end - 1); } + const TYPE &back() const { return *(_end - 1); } + + // Iterate just the values + iterator begin() { return _start; } + iterator end() { return _end; } + const_iterator begin() const { return _start; } + const_iterator end() const { return _end; } + + int size() const { return int(std::distance(_start, _end)); } + + TYPE &operator[](int index) { return *(_start + index); } +}; diff --git a/ethosu/regor/compiler/attributes.cpp b/ethosu/regor/compiler/attributes.cpp new file mode 100644 index 00000000..009a01be --- /dev/null +++ b/ethosu/regor/compiler/attributes.cpp @@ -0,0 +1,66 @@ +// +// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#include "attributes.hpp" + +#include "common/logging.hpp" + +namespace regor +{ + +#define REDUCED_HASH(hash) (hash & 0x000FFFFF) +#define CASE_MAKE_ATTR_INSTANCE(TYPE_) \ + case REDUCED_HASH(TypeHash::HASH): \ + return DynamicRef(TypeInfoOf::Get(true), TypeInfoOf::SharedNew()); + +DynamicRef CreateAttribute(uint32_t reducedHash) +{ + reducedHash = REDUCED_HASH(reducedHash); + switch ( reducedHash ) + { + CASE_MAKE_ATTR_INSTANCE(asr_attr_t); + CASE_MAKE_ATTR_INSTANCE(axis_attr_t); + CASE_MAKE_ATTR_INSTANCE(reshape_attr_t); + CASE_MAKE_ATTR_INSTANCE(clamp_attr_t); + CASE_MAKE_ATTR_INSTANCE(concat_attr_t); + CASE_MAKE_ATTR_INSTANCE(cond_attr_t); + CASE_MAKE_ATTR_INSTANCE(custom_attr_t); + CASE_MAKE_ATTR_INSTANCE(fft_attr_t); + CASE_MAKE_ATTR_INSTANCE(leaky_relu_attr_t); + CASE_MAKE_ATTR_INSTANCE(mul_attr_t); + CASE_MAKE_ATTR_INSTANCE(pack_unpack_attr_t); + CASE_MAKE_ATTR_INSTANCE(pooling_attr_t); + CASE_MAKE_ATTR_INSTANCE(rescale_attr_t); + CASE_MAKE_ATTR_INSTANCE(resize_attr_t); + CASE_MAKE_ATTR_INSTANCE(slice_attr_t); + CASE_MAKE_ATTR_INSTANCE(softmax_attr_t); + CASE_MAKE_ATTR_INSTANCE(strided_slice_attr_t); + CASE_MAKE_ATTR_INSTANCE(tile_attr_t); + CASE_MAKE_ATTR_INSTANCE(transpose_attr_t); + CASE_MAKE_ATTR_INSTANCE(transpose_conv2d_attr_t); + CASE_MAKE_ATTR_INSTANCE(while_attr_t); + default: + assert(false && "No attribute has this reduced hash"); + // Add a new XXX_attr_t struct to the header then + // insert a new case entry in the statement above + break; + }; + return {}; +} + +} // namespace regor diff --git a/ethosu/regor/compiler/attributes.hpp b/ethosu/regor/compiler/attributes.hpp new file mode 100644 index 00000000..c7b4a897 --- /dev/null +++ b/ethosu/regor/compiler/attributes.hpp @@ -0,0 +1,259 @@ +// +// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with 
the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "common/common.hpp" + +#include "common/data_type.hpp" +#include "common/dynamic_typing.hpp" +#include "common/numeric_util.hpp" +#include "common/shape.hpp" +#include "include/graphapi.hpp" + +#include +#include + +namespace regor +{ + +#define GRAPHAPI_FUNCTION_DETAIL_ONLY +#include "include/graphapi_attr.hpp" + +// MakeAttributeId places the type information in the lower 4-bits for the specific +// purpose of stripping it with a right shift. This allows us to compare fields +// of a different type ids. +#define ATTR_FIELD_ID(CLASS_, UNIQUE_) detail::MakeAttributeId(TypeHash::HASH, 0, UNIQUE_) +#define ATTR_FIELD(MEMBER_, UNIQUE_) \ + { \ + offsetof(thisclass_t, MEMBER_), ATTR_FIELD_ID(thisclass_t, UNIQUE_) >> 4, \ + REGOR_FIELD_TYPE(decltype(reinterpret_cast(0)->MEMBER_)) \ + } \ + , + +struct pooling_attr_t +{ + DataType accPrecision; + BEGIN_FIELD_TABLE(pooling_attr_t) + ATTR_FIELD(accPrecision, 0) + END_FIELD_TABLE() +}; + +struct axis_attr_t +{ + int32_t axis; + BEGIN_FIELD_TABLE(axis_attr_t) + ATTR_FIELD(axis, 0) + END_FIELD_TABLE() +}; + +struct reshape_attr_t +{ + Shape shape; + BEGIN_FIELD_TABLE(reshape_attr_t) + ATTR_FIELD(shape, 0) + END_FIELD_TABLE() +}; + +struct slice_attr_t +{ + Shape begin; + Shape size; + BEGIN_FIELD_TABLE(slice_attr_t) + ATTR_FIELD(begin, 0) + ATTR_FIELD(size, 1) + END_FIELD_TABLE() +}; + +struct resize_attr_t +{ + Point2i scaleX; // X/Y (x=numerator/y=denominator) + Point2i scaleY; // X/Y (x=numerator/y=denominator) + Point2i offset; + Point2i border; + tosa::ResizeMode mode; 
+ BEGIN_FIELD_TABLE(resize_attr_t) + ATTR_FIELD(scaleX, 0) + ATTR_FIELD(scaleY, 1) + ATTR_FIELD(offset, 2) + ATTR_FIELD(border, 3) + ATTR_FIELD(mode, 4) + END_FIELD_TABLE() +}; + +struct clamp_attr_t +{ + double min; + double max; + BEGIN_FIELD_TABLE(clamp_attr_t) + ATTR_FIELD(min, 0) + ATTR_FIELD(max, 1) + END_FIELD_TABLE() +}; + +struct rescale_attr_t +{ + bool scale32; + bool double_round; + bool per_channel; + BEGIN_FIELD_TABLE(rescale_attr_t) + ATTR_FIELD(scale32, 0) + ATTR_FIELD(double_round, 1) + ATTR_FIELD(per_channel, 2) + END_FIELD_TABLE() +}; + +struct mul_attr_t +{ + int32_t shift; + BEGIN_FIELD_TABLE(mul_attr_t) + ATTR_FIELD(shift, 0) + END_FIELD_TABLE() +}; + +struct asr_attr_t +{ + bool round; + BEGIN_FIELD_TABLE(asr_attr_t) + ATTR_FIELD(round, 0) + END_FIELD_TABLE() +}; + +struct cond_attr_t +{ + std::string then_branch; + std::string else_branch; + BEGIN_FIELD_TABLE(cond_attr_t) + ATTR_FIELD(then_branch, 0) + ATTR_FIELD(else_branch, 1) + END_FIELD_TABLE() +}; + +struct while_attr_t +{ + std::string cond_branch; + std::string body_branch; + BEGIN_FIELD_TABLE(while_attr_t) + ATTR_FIELD(cond_branch, 0) + ATTR_FIELD(body_branch, 1) + END_FIELD_TABLE() +}; + +struct transpose_conv2d_attr_t +{ + Shape outShape; + BEGIN_FIELD_TABLE(transpose_conv2d_attr_t) + ATTR_FIELD(outShape, 0) + END_FIELD_TABLE() +}; + +struct transpose_attr_t +{ + Shape perm; + BEGIN_FIELD_TABLE(transpose_attr_t) + ATTR_FIELD(perm, 0) + END_FIELD_TABLE() +}; + +struct tile_attr_t +{ + Shape multiples; + BEGIN_FIELD_TABLE(tile_attr_t) + ATTR_FIELD(multiples, 0) + END_FIELD_TABLE() +}; + +struct fft_attr_t +{ + bool inverse; + BEGIN_FIELD_TABLE(fft_attr_t) + ATTR_FIELD(inverse, 0) + END_FIELD_TABLE() +}; + +struct custom_attr_t +{ + std::string name; + std::string domain; + BEGIN_FIELD_TABLE(custom_attr_t) + ATTR_FIELD(name, 0) + ATTR_FIELD(domain, 1) + END_FIELD_TABLE() +}; + +struct strided_slice_attr_t +{ + int begin_mask; + int end_mask; + int ellipsis_mask; + int new_axis_mask; + 
int shrink_axis_mask; + BEGIN_FIELD_TABLE(strided_slice_attr_t) + ATTR_FIELD(begin_mask, 0) + ATTR_FIELD(end_mask, 1) + ATTR_FIELD(ellipsis_mask, 2) + ATTR_FIELD(new_axis_mask, 3) + ATTR_FIELD(shrink_axis_mask, 4) + END_FIELD_TABLE() +}; + +struct tflite_resize_t +{ + bool alignCorners; + bool halfPixelCenters; + BEGIN_FIELD_TABLE(tflite_resize_t) + ATTR_FIELD(alignCorners, 0) + ATTR_FIELD(halfPixelCenters, 1) + END_FIELD_TABLE() +}; + +struct leaky_relu_attr_t +{ + float alpha; + BEGIN_FIELD_TABLE(leaky_relu_attr_t) + ATTR_FIELD(alpha, 0) + END_FIELD_TABLE() +}; + +struct softmax_attr_t +{ + float beta; + BEGIN_FIELD_TABLE(softmax_attr_t) + ATTR_FIELD(beta, 0) + END_FIELD_TABLE() +}; + +struct concat_attr_t +{ + int axis; + BEGIN_FIELD_TABLE(concat_attr_t) + ATTR_FIELD(axis, 0) + END_FIELD_TABLE() +}; + +struct pack_unpack_attr_t +{ + int axis; + BEGIN_FIELD_TABLE(pack_unpack_attr_t) + ATTR_FIELD(axis, 0) + END_FIELD_TABLE() +}; + +DynamicRef CreateAttribute(uint32_t reducedhash); + +} // namespace regor diff --git a/ethosu/regor/compiler/cascade_builder.cpp b/ethosu/regor/compiler/cascade_builder.cpp new file mode 100644 index 00000000..1d185f22 --- /dev/null +++ b/ethosu/regor/compiler/cascade_builder.cpp @@ -0,0 +1,399 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +// #define LOG_TRACE_ENABLE TD_1 +#include "cascade_builder.hpp" + +#include "common/logging.hpp" + +#include "common/numeric_util.hpp" +#include "common/shape.hpp" +#include "op_type.hpp" +#include "scheduler.hpp" +#include "scheduler_operation.hpp" + +#include +#include +#include + +namespace regor +{ + +class BufferMap +{ + using Key = std::pair; + struct KeyHash + { + size_t operator()(const Key &k) const { return (k.first << 8) ^ k.second; } + }; + +private: + std::unordered_map _cache; + +public: + CascadeBuffer GetBuffer(SchedulerOperation *producer, SchedulerOperation *consumer, const Schedule *refSchedule) + { + auto key = Key(producer ? *producer : 0, consumer ? *consumer : 0); + auto pos = _cache.find(key); + if ( pos != _cache.end() ) + { + return pos->second; + } + + Shape bufferShape; + int bufferSize = 0; + // No cached buffer between these two SchedulerOperations + if ( consumer == nullptr ) + { + auto ofm = producer->OFM(); + // There are either no consumers or multiple consumers - FeatureMap needs to be stored in full + bufferShape = ofm->shape; + bufferSize = ofm->tensor->AllocationSizeBytes(); + } + else if ( producer == nullptr ) + { + auto ifm = consumer->IFM(consumer->PrimaryIfmIndex()); + // First Op in subgraph or cascade - FeatureMap needs to be stored in full + bufferShape = ifm->shape; + bufferSize = ifm->tensor->AllocationSizeBytes(); + } + else + { + auto ofm = producer->OFM(); + auto ifm = consumer->IFM(consumer->PrimaryIfmIndex()); + + if ( ofm->requireFullTensor || ifm->requireFullTensor ) + { + // FeatureMap needs to be stored in full + bufferShape = Shape::Max(ofm->shape, ifm->shape); + bufferSize = std::max(ofm->tensor->AllocationSizeBytes(), ifm->tensor->AllocationSizeBytes()); + } + else + { + // Use a rolling buffer + auto producerCost = refSchedule->Cost(producer); + auto consumerCost = refSchedule->Cost(consumer); + + bufferShape = RollingBufferShape(producerCost->stripe, consumerCost->stripeInput[0]); + bufferSize = 
DataTypeStorageSizeBytes(ofm->tensor->dataType, bufferShape.Elements()); + } + } + _cache.emplace(key, CascadeBuffer(bufferShape, bufferSize)); + + return CascadeBuffer(bufferShape, bufferSize); + } + + Shape RollingBufferShape(const Shape &producerStripeShape, const Shape &consumerStripeShape) + { + // Calculates the storage shape of the rolling buffer between two SchedulerOperations in a Cascade + int buffer_height = RoundAway(producerStripeShape.Height() + consumerStripeShape.Height(), consumerStripeShape.Height()); + // Rolling buffers have to conform to NHCWB16 alignment + return consumerStripeShape.With(-3, buffer_height).With(-1, RoundAway(producerStripeShape.Depth(), 16)); + } +}; + + +CascadeBuilder::CascadeBuilder(vector_span> ops, const std::unordered_map &nonLocalMemUsage, bool spilling) : + _ops(ops), _nonLocalMemUsage(nonLocalMemUsage) + +{ + _spilling = spilling; +} + + +void CascadeBuilder::BuildCascades(Schedule *refSchedule, Schedule *fallbackSchedule, Address guidingStagingLimit) +{ + BufferMap buffers; + SchedulerCostMap costs; + std::unordered_map cascadeMap; + + + LOG_TRACE1("Build Cascades for '{}' with limit of {} bytes\n", refSchedule->Name(), guidingStagingLimit); + // Peak memory usage so far - updated continuously, except where spilling makes this a hard limit + int peakStagingUsage = int(std::min(INT64_C(1) << 30, guidingStagingLimit)); + auto pos = _ops.begin(); + while ( pos != _ops.end() ) + { + SchedulerOperation *op = pos->get(); + if ( !op->IsNpuOp() ) + { + pos++; + continue; + } + + // Already processed this Op if it has a cost + if ( costs.find(*op) != costs.end() ) + { + pos++; + continue; + } + + auto fallbackCost = fallbackSchedule->Cost(op); + + SchedulerConnection *ifm = op->IFM(op->PrimaryIfmIndex()); + + // If Op is not a candidate for cascading - assign fallback cost + if ( !IsCascadable(op, ifm->tensor.get(), refSchedule->Cost(op)) ) + { + costs[*op] = std::make_unique(*fallbackCost); + if ( !_spilling ) + { + 
peakStagingUsage = std::max(EstimateBufferUsage(op, fallbackCost), peakStagingUsage); + } + pos++; + continue; + } + + // Propose a cascade starting with this Op + // Keep track of which Ops are in the proposed cascade as well as the best cascade so far + int cascadeStart = op->Index(); + std::vector opsInCascade = {op}; + std::vector opsInBestCascade = {op}; + + // Get the size of the weight buffer + int weightBufferSize = 0; + auto refCost = refSchedule->Cost(op); + if ( refCost->bufferedWeightTensor.tensor ) + { + weightBufferSize = refCost->bufferedWeightTensor.tensor->AllocationSizeBytes(); + } + + // The first IFM needs to be stored in full + int cascadeIFMSize = _spilling ? 0 : ifm->tensor->AllocationSizeBytes(); + + // Add non-local memory usage + cascadeIFMSize += NonLocalUsage(*op); + + // Sum of all intermediate cascade buffers (including weight buffers) + int cascadeBuffersSize = weightBufferSize; + + // Best cascade size - Initially it's the fallback cost of the first Op in the cascade + int bestCascadeSize = EstimateBufferUsage(op, fallbackCost); + + // Op is the producer of the OFM consumed by the next Op to consider + auto producer = op; + while ( true ) + { + auto &dependants = producer->OFM()->tensor->consumers; + + if ( dependants.size() != 1u ) + { + // producer is either the last Op in the schedule or the start of a branch + break; + } + + SchedulerOperation *currentOp = dependants[0]; + refCost = refSchedule->Cost(currentOp); + + auto currentIfm = currentOp->IFM(currentOp->PrimaryIfmIndex()); + + if ( costs.find(*currentOp) != costs.end() || (refCost == nullptr) || + !IsCascadable(currentOp, currentIfm->tensor.get(), refCost) || + producer->OFM()->shape != currentIfm->shape || currentIfm->requireFullTensor ) + { + // Current op has already been processed or cannot be cascaded + break; + } + if ( currentOp->Index() != producer->Index() + 1 ) + { + // Cascading is possible, but requires reordering of operations in the schedule, + // this is 
currently not supported + break; + } + + // Get the size of the FeatureMap buffers between current and neighbouring Ops + int opFullIfmSize = currentIfm->tensor->AllocationSizeBytes(); + int opFullOfmSize = currentOp->OFM()->tensor->AllocationSizeBytes(); + + auto bufferInfo = buffers.GetBuffer(producer, currentOp, refSchedule); + int ifmBufferSize = bufferInfo.sizeBytes; + + // Get the size of the weight buffer + int opWeightBuffer = 0; + if ( refCost->bufferedWeightTensor.tensor ) + { + opWeightBuffer = refCost->bufferedWeightTensor.tensor->AllocationSizeBytes(); + } + + // Calculate the uncascaded memory requirement for current Op + int uncascadedStagingUsage = opFullIfmSize + opFullOfmSize + NonLocalUsage(*currentOp); + + // Add current Op to cascade + opsInCascade.push_back(currentOp); + + // Increase the accumulated intermediate buffers in the cascade + cascadeBuffersSize += ifmBufferSize + opWeightBuffer; + + LOG_TRACE1("\tAppend '{0}:{1}' to cascade\n", currentOp->Index(), OpTypeToString(currentOp->Type())); + LOG_TRACE1("\t\tFull Primary IFM [{0}] bytes = {1}, Full OFM bytes [{2}] = {3}\n", + currentIfm->shape.ToString(), opFullIfmSize, currentOp->OFM()->shape.ToString(), opFullOfmSize); + LOG_TRACE1("\t\tCascade buffer bytes = {0} - [{1}]\n", cascadeBuffersSize, bufferInfo.shape.ToString()); + + if ( _spilling ) + { + if ( (uncascadedStagingUsage < peakStagingUsage) || (cascadeBuffersSize > peakStagingUsage) ) + { + // Cascade until an Op fits in its entirety or the accumulated buffers no longer fit + break; + } + else + { + opsInBestCascade = opsInCascade; + bestCascadeSize = cascadeBuffersSize; + } + } + else + { + // Calculate the total size of the current cascade + int cascadeSize = cascadeIFMSize + cascadeBuffersSize + opFullOfmSize; + + // Determine if current cascade is the best so far + if ( cascadeSize < bestCascadeSize ) + { + bestCascadeSize = cascadeSize; + opsInBestCascade = opsInCascade; + } + // Determine if cascading search should stop + 
if ( ((uncascadedStagingUsage < peakStagingUsage) && (bestCascadeSize < peakStagingUsage)) || + (cascadeIFMSize + cascadeBuffersSize) > bestCascadeSize ) + { + // Both the existing cascade and current Op fits + break; + } + } + + producer = currentOp; + } + + if ( opsInBestCascade.size() > 1 ) + { + // A cascade was created - assign cascade and ref_cost to all of the Ops + int cascadeEnd = cascadeStart + int(opsInBestCascade.size()) - 1; // Inclusive end + + std::unordered_map buffersInCascade; + SchedulerOperation *prevOp = nullptr; + for ( auto cascadedOp : opsInBestCascade ) + { + assert(cascadedOp->Index() <= cascadeEnd); + auto cascadedCost = std::make_unique(*refSchedule->Cost(cascadedOp)); + cascadedCost->cascade = cascadeEnd; + costs.emplace(*cascadedOp, std::move(cascadedCost)); + + if ( prevOp ) + { + auto const &buffer = buffers.GetBuffer(prevOp, cascadedOp, refSchedule); + buffersInCascade[*cascadedOp] = buffer; + } + + prevOp = cascadedOp; + } + + // Create a CascadeInfo for the cascade + cascadeMap.emplace(cascadeEnd, CascadeInfo(cascadeStart, cascadeEnd, bestCascadeSize, std::move(buffersInCascade))); + if ( !_spilling ) + { + // Update peak memory usage + peakStagingUsage = std::max(bestCascadeSize, peakStagingUsage); + } + } + else + { + // Assign fallback cost to the initial Op + costs.emplace(*op, std::make_unique(*fallbackCost)); + if ( !_spilling ) + { + peakStagingUsage = std::max(EstimateBufferUsage(op, fallbackCost), peakStagingUsage); + } + } + } + // Update costing and cascade information for the ref_schedule + refSchedule->UpdateCosts(costs); + refSchedule->cascades = std::move(cascadeMap); +} + + +bool CascadeBuilder::IsCascadable(const SchedulerOperation *op, SchedulerTensor *ifm, SchedulerOpInfo *cost) const +{ + OpType type = op->Type(); + + if ( ifm->srcTensor->IsConstant() ) + { + return false; + } + + // Cascadable operations currently need to support NHCWB16 + if ( ifm->needsLinearFormat ) + { + return false; + } + + if ( 
IsDma(op->Type()) ) + { + return false; + } + + if ( op->IsReordering() ) + { + LOG_TRACE1("Not cascading Transpose/Reverse"); + return false; + } + + return (cost->stripe.Height() < op->OFM()->shape.Height()) && (IsConvolution(type) || IsElementwise(type) || IsPooling(type)); +} + + +int CascadeBuilder::EstimateBufferUsage(SchedulerOperation *op, SchedulerOpInfo *) const +{ + // Estimate the RAM required for the Op if all FeatureMaps are in RAM + int size = NonLocalUsage(*op); + + for ( auto usage : {TensorUsage::IFM, TensorUsage::IFM1, TensorUsage::OFM} ) + { + SchedulerConnection *fm = IsOFM(usage) ? op->Output(usage) : op->TryInput(usage); + if ( !fm ) + { + continue; + } + + if ( fm->requireFullTensor ) + { + size += fm->tensor->AllocationSizeBytes(); + } + else + { + size += fm->PartialAllocationSizeBytes(); + size = RoundAway(size, 16); + } + } + + return size; +} + + +int CascadeBuilder::NonLocalUsage(UniqueId uid) const +{ + auto opPos = _nonLocalMemUsage.find(uid); + if ( opPos != _nonLocalMemUsage.end() ) + { + return opPos->second; + } + + return 0; +} + +} // namespace regor diff --git a/ethosu/regor/compiler/cascade_builder.hpp b/ethosu/regor/compiler/cascade_builder.hpp new file mode 100644 index 00000000..df70ca4c --- /dev/null +++ b/ethosu/regor/compiler/cascade_builder.hpp @@ -0,0 +1,99 @@ +// +// SPDX-FileCopyrightText: Copyright 2021, 2023-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "common/common.hpp" + +#include "common/shape.hpp" +#include "common/vector_span.hpp" +#include "op_type.hpp" +#include "scheduler_operation.hpp" + +#include +#include +#include + +namespace regor +{ + +class Schedule; + +/// +/// Information about a cascade buffer +/// +struct CascadeBuffer +{ + Shape shape; + int sizeBytes = 0; + CascadeBuffer() = default; + CascadeBuffer(const CascadeBuffer &) = default; + CascadeBuffer(const Shape &s, int size) : shape(s), sizeBytes(size) {} + CascadeBuffer &operator=(const CascadeBuffer &) = default; +}; + + +/// +/// Information about a cascade within a schedule +/// +struct CascadeInfo +{ + int start = 0; + int end = 0; + int memUsage = 0; + std::unordered_map buffers; + + CascadeInfo() = default; + CascadeInfo(const CascadeInfo &) = default; + CascadeInfo(int start_, int end_, int memUsage_, std::unordered_map buffers_) + { + this->start = start_; + this->end = end_; + this->memUsage = memUsage_; + this->buffers = std::move(buffers_); + } + CascadeInfo &operator=(const CascadeInfo &) = default; +}; + + +class SchedulerOpInfo; + +/// +/// Cascade builder for lists of scheduler operations +/// +class CascadeBuilder +{ +private: + vector_span> _ops; + const std::unordered_map &_nonLocalMemUsage; + bool _spilling = false; + +public: + CascadeBuilder(vector_span> ops, + const std::unordered_map &nonLocalMemUsage, bool spilling); + +public: + void BuildCascades(Schedule *refSchedule, Schedule *fallbackSchedule, Address guidingStagingLimit); + +private: + bool IsCascadable(const SchedulerOperation *op, SchedulerTensor *ifm, SchedulerOpInfo *cost) const; + int EstimateBufferUsage(SchedulerOperation *op, SchedulerOpInfo *cost) const; + int NonLocalUsage(UniqueId uid) const; +}; + +} // namespace regor diff --git a/ethosu/regor/compiler/compiler.cpp b/ethosu/regor/compiler/compiler.cpp new file 
mode 100644 index 00000000..a9c4237c --- /dev/null +++ b/ethosu/regor/compiler/compiler.cpp @@ -0,0 +1,455 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "compiler.hpp" + +#include "common/logging.hpp" + +#include "architecture/register_command_stream_generator.hpp" +#include "common/bit_flags.hpp" +#include "common/ini_reader.hpp" +#include "graph_optimiser.hpp" +#include "graph_packing.hpp" +#include "graph_validator.hpp" +#include "high_level_command_stream_generator.hpp" +#include "network_performance.hpp" +#include "raw_writer.hpp" +#include "scheduler_packing.hpp" +#include "tensor_allocator.hpp" +#include "tflite/custom_operator_ethosu.hpp" +#include "tflite/tflite_reader.hpp" +#include "tflite/tflite_writer.hpp" +#include "tosa/tosa_reader.hpp" + +BEGIN_ENUM_TABLE(regor::OutputFormat) + ADD_ENUM_NAME(None) + ADD_ENUM_NAME(TFLite) + ADD_ENUM_NAME(Raw) +END_ENUM_TABLE() + +namespace regor +{ + +Compiler::Compiler(std::unique_ptr &arch) +{ + _architecture = std::move(arch); +} + +Compiler::~Compiler() +{ + for ( auto blob : _output ) + { + blob->Release(); + } +} + +bool Compiler::ParseConfig(const char *text, size_t size) +{ + // Get architecture configuration + IniReader reader(text, size); + + std::string section; + while ( reader.Begin(section) ) + { + auto result = _architecture->ParseSection(section, &reader); 
+ if ( result == IniParseResult::Error ) + { + SetLastError(fmt::format("Error parsing [{}]", section)); + return false; + } + reader.End(); + } + + return true; +} + + +bool Compiler::ParseOptions(const char *text, size_t size) +{ + Logging::Out.SetFilterMask(1 | 2 | 4); + + // Get compiler info + IniReader reader(text, size); + + std::string section; + while ( reader.Begin(section) ) + { + if ( section == "debug" ) + { + // Parse debug settings + std::string key; + while ( reader.Begin(key) ) + { + if ( key == "trace" ) + { + if ( reader.Get() ) + { + Logging::Out.SetFilterMask(Logging::Out.FilterMask() | 8 | 16 | 32); + } + } + reader.End(); + } + } + else if ( section == "compiler" ) + { + // Parse compiler options + std::string key; + while ( reader.Begin(key) ) + { + if ( key == "verbose_high_level_command_stream" ) + { + _compilerOptions.verboseHighLevelCommandStream = reader.Get(); + } + else if ( key == "verbose_register_command_stream" ) + { + _compilerOptions.verboseRegisterCommandStream = reader.Get(); + } + else if ( key == "enable_db" ) + { + _compilerOptions.debugDatabase = reader.Get(); + } + else if ( key == "perf_report" ) + { + _compilerOptions.perfReport = reader.Get(); + } + else if ( key == "output_format" ) + { + Flags flags; + flags.Parse(reader.Get()); + _compilerOptions.outputFormat = flags; + } + reader.End(); + } + } + else if ( section == "scheduler" ) + { + ParseSchedulerOptions(_schedulerOptions, reader); + } + else if ( section == "graph" ) + { + GraphOptimiser::ParseGraphOptimiserOptions(_graphOptimiserOptions, reader); + } + else + { + LOG_WARN("Skipping parsing of unrecognised options section '{}'\n", section); + } + + reader.End(); + } + + return true; +} + + +bool Compiler::LoadTosa(const void *input, size_t size) +{ + TosaReader::LoadGraphs(input, size, _builders); + return !_builders.empty(); +} + + +bool Compiler::LoadTflite(const void *input, size_t size) +{ + assert(input && size > 0); + + // Instantiate debug database if 
required early for TFLite + if ( _compilerOptions.debugDatabase != !!_optDb ) + _optDb = _compilerOptions.debugDatabase ? std::make_unique(&_Db) : nullptr; + + TfLiteReader::LoadGraphs(input, size, _graphs, _optDb.get()); + return !_graphs.empty(); +} + + +// Flatbuffer output blob +class RawBlob : public IRegorBlob +{ +private: + int _refCount = 1; + std::unique_ptr _buffer; + int64_t _offset; + int64_t _size; + +public: + RawBlob(std::unique_ptr buffer, int64_t offset, int64_t size) : + _buffer(std::move(buffer)), _offset(offset), _size(size) + { + } + + void AddRef() { _refCount++; } + + void Release() + { + if ( --_refCount == 0 ) + { + delete this; + } + } + + void *Map(int64_t &size) + { + size = _size; + return const_cast(_buffer.get() + _offset); + } + + void Unmap(void *) {} +}; + + +bool Compiler::Store(const std::vector> &graphs, + const std::vector> &tensorAddressMaps) +{ + if ( _compilerOptions.outputFormat == OutputFormat::Raw ) + { + RawWriter writer; + + // This will serialise multiple blobs + auto buffers = writer.Serialise(graphs, tensorAddressMaps); + + for ( auto &[buffer, bufferSize] : buffers ) + { + RawBlob *output = new RawBlob(std::move(buffer), 0, bufferSize); + _output.push_back(output); + } + } + else + { + TfLiteWriter writer; + int64_t offset; + size_t size; + + // This will only serialise one TFLite model + auto buffer = writer.Serialise(graphs, tensorAddressMaps, offset, size); + + RawBlob *output = new RawBlob(std::move(buffer), offset, int64_t(size)); + _output.push_back(output); + } + + return true; +} + + +bool Compiler::Compile() +{ + // Check that the configuration is okay to start compiling + std::string error; + if ( !_architecture->CheckConfiguration(error) ) + { + SetLastError(error); + return false; + } + + // If no graphs defined (nothing already loaded) then create a network from the graph builders + if ( _graphs.empty() ) + { + if ( _builders.empty() ) + { + SetLastError("No networks defined via GraphAPI"); + return 
false; + } + + // Instantiate debug database if required for GraphAPI + if ( _compilerOptions.debugDatabase != !!_optDb ) + _optDb = _compilerOptions.debugDatabase ? std::make_unique(&_Db) : nullptr; + + if ( !BuildNetwork(nullptr) ) // BuildNetworks sets error text + { + return false; + } + } + + // Is used to allocate all constant Npu tensors, in permanent storage + IncrementalLinearAllocator readOnlyAllocator("read-only NPU tensors"); + + // Compile each graph/subgraph separately + std::vector> newGraphs; + std::vector> tensorAddressMaps; + for ( auto &graph : _graphs ) + { + std::unordered_map tensorAddressMap; + auto newGraph = CompileGraph(graph, readOnlyAllocator, tensorAddressMap); + if ( !newGraph ) + { + return false; + } + newGraphs.push_back(std::move(newGraph)); + tensorAddressMaps.push_back(std::move(tensorAddressMap)); + } + + _optDb.reset(); + Store(newGraphs, tensorAddressMaps); + + _builders.clear(); + return true; +} + + +bool Compiler::BuildNetwork(const char *entryGraph) +{ + // Iterate through the builders committing their inputs/outputs to new + // graph objects. Any un-attached data will be dropped later. + for ( auto &builder : _builders ) + { + auto graph = std::make_unique( + builder.Name(), builder._inputs, builder._outputs, GraphNotation::GraphAPI, builder.SyntaxVersion()); + if ( entryGraph && (builder.Name() == entryGraph) ) + { + assert(!_entryPoint && "Entrypoint already set"); + _entryPoint = graph.get(); + } + _graphs.push_back(std::move(graph)); + } + + if ( _graphs.empty() ) + { + SetLastError("No graphs defined in network"); + return false; + } + + // Select first graph as entrypoint if none selected + if ( !_entryPoint ) + { + _entryPoint = _graphs.front().get(); + } + + // Clearing the builders will release anything that the client allocated + // but didn't use. Unconnected operators will get freed. 
+ _builders.clear(); + return true; +} + +std::unique_ptr Compiler::CompileGraph(std::unique_ptr &graph, + IncrementalLinearAllocator &readOnlyAllocator, std::unordered_map &tensorAddressMap) +{ + // Validate the input graph semantics + if ( graph->Notation() == GraphNotation::GraphAPI ) + { + auto validator = GraphValidator::MakeGraphValidator(graph->Notation(), graph->SyntaxVersion(), this); + if ( validator == nullptr ) + { + LOG_WARN("Input graph {0} not validated (required for GraphAPI) syntax={1:X}\n", graph->Name(), graph->SyntaxVersion()); + return nullptr; + } + if ( !validator->Validate(graph.get()) ) + { + SetLastError(validator->GetErrorMsg()); + return nullptr; + } + } + + // Preprocess/optimise the graph + std::unique_ptr optimiser = GraphOptimiser::MakeGraphOptimiser( + graph->Notation(), _architecture.get(), _graphOptimiserOptions, _optDb.get()); + if ( optimiser ) + { + optimiser->Process(graph.get()); + } + + // Pack/linearise graph Operations into SchedulerOperations + SchedulerPacking packing(_architecture.get()); + auto scheduleOps = packing.Process(graph.get()); + + // Schedule the linearised operation sequence + Scheduler scheduler(_architecture.get(), _schedulerOptions, "graph", scheduleOps); + auto schedule = scheduler.Process(); + + scheduler.AllocateReadOnlyAddresses(schedule.get(), readOnlyAllocator); + + // Calculate full network performance + if ( _compilerOptions.perfReport ) + { + NetworkPerformance perf(_architecture.get(), scheduleOps); + _perfResult = perf.Measure(schedule.get(), _optDb.get()); + } + + // Get a new graph and NPU operations from the scheduled operations + std::vector>> npuOps; + std::unique_ptr newGraph = PackScheduleToGraph(npuOps, scheduleOps, tensorAddressMap, graph.get()); + +#ifndef NDEBUG + // Validate the output graph is NPU-only + Graph::TraverseGraphFromEnd(newGraph->Outputs(), + [&](Operation *op) -> bool + { + assert((op->Type() == OpType::CustomNpuOp) || (graph->Notation() != GraphNotation::GraphAPI)); 
+ return true; + }); +#endif + + auto customOperatorBuilder = CustomOperatorBuilder(_architecture.get(), schedule.get()); + customOperatorBuilder.AllocateScratchTensors(tensorAddressMap); + + // Work over the NPU ops, generating code + for ( const auto &pair : npuOps ) + { + auto *graphOp = pair.first; + const auto *npuOp = pair.second.get(); + + // Generate HLCS + auto hlcsGenerator = HLCStreamGenerator(); + auto highLevelCommandStream = hlcsGenerator.GenerateCommandStream(npuOp, schedule.get(), _compilerOptions.verboseHighLevelCommandStream); + + // Generate LLCS for output + std::vector> cmdRanges; + auto registerCommandStream = _architecture->RegisterCommandStreamGenerator()->GenerateCommandStream( + highLevelCommandStream, &cmdRanges, _compilerOptions.verboseRegisterCommandStream); + + if ( registerCommandStream.empty() ) + { + SetLastError("Failed to generate command stream"); + return nullptr; + } + + if ( _optDb ) + { + int streamId = _optDb->AddStream(); + for ( auto const &cmd : cmdRanges ) + { + _optDb->AddCommand(std::get<0>(cmd), streamId, std::get<2>(cmd) - 1); + } + } + + customOperatorBuilder.Serialise(graphOp, npuOp, registerCommandStream); + } + + return newGraph; +} + +GraphApi::IGraphBuilder *Compiler::CreateGraph(const char *name) +{ + auto pos = std::find_if(_builders.begin(), _builders.end(), [&](auto &b) { return b.Name() == name; }); + if ( pos != _builders.end() ) + { + return &(*pos); + } + + _builders.emplace_back(name); + return &_builders.back(); +} + +Graph *Compiler::GetGraph(const char *name) +{ + auto pos = std::find_if(_graphs.begin(), _graphs.end(), [&](auto &b) { return b->Name() == name; }); + if ( pos != _graphs.end() ) + { + return pos->get(); + } + return nullptr; +} + +} // namespace regor diff --git a/ethosu/regor/compiler/compiler.hpp b/ethosu/regor/compiler/compiler.hpp new file mode 100644 index 00000000..4ed81bb1 --- /dev/null +++ b/ethosu/regor/compiler/compiler.hpp @@ -0,0 +1,133 @@ +// +// SPDX-FileCopyrightText: 
Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "common/common.hpp" + +#include "architecture/architecture.hpp" +#include "database.hpp" +#include "graph.hpp" +#include "graph_builder.hpp" +#include "graph_optimiser.hpp" +#include "include/regor_interface.hpp" +#include "network_performance.hpp" +#include "scheduler.hpp" +#include "tensor_allocator.hpp" + +#include +#include +#include + +#include "include/regor.h" + +namespace regor +{ + +enum class OutputFormat : uint16_t +{ + None, + TFLite, + Raw, +}; + +/// +/// Compilation options +/// +struct CompilerOptions +{ + bool verboseHighLevelCommandStream = false; + bool verboseRegisterCommandStream = false; + bool debugDatabase = false; + bool perfReport = true; + OutputFormat outputFormat = OutputFormat::TFLite; +}; + +/// +/// Regor top level compiler context (could just become Context) +/// +class Compiler : public IRegorReporting +{ +private: + SchedulerOptions _schedulerOptions; + CompilerOptions _compilerOptions; + GraphOptimiserOptions _graphOptimiserOptions; + std::unique_ptr _architecture; + std::string _lastError; + std::deque _output; + PerformanceResult _perfResult; + class Database _Db; + std::unique_ptr _optDb; + + Graph *_entryPoint = nullptr; + std::vector> _graphs; + std::list _builders; + +public: + void *userApiArg = nullptr; + +public: + Compiler() = delete; + 
Compiler(const Compiler &) = delete; + Compiler(std::unique_ptr &arch); + ~Compiler(); + +public: + bool ParseConfig(const char *text, size_t size); + bool ParseOptions(const char *text, size_t size); + + bool LoadTosa(const void *input, size_t size); + bool LoadTflite(const void *input, size_t size); + bool Store(const std::vector> &graphs, + const std::vector> &tensorAddressMaps); + + bool Compile(); + + [[nodiscard]] IRegorBlob *Output() + { + if ( _output.empty() ) + { + return nullptr; + } + + auto blob = _output.front(); + _output.pop_front(); + return blob; + } + + void SetLastError(const char *message) { _lastError = message; } + void SetLastError(const std::string &message) { _lastError = message; } + Architecture *Arch() { return _architecture.get(); } + const std::string &LastError() const { return _lastError; } + const PerformanceResult &LastPerfResult() const { return _perfResult; } + // From IRegorReporting + IDatabase *OptimiserDatabase() { return &_Db; } + + GraphApi::IGraphBuilder *CreateGraph(const char *name); + Graph *GetGraph(const char *name); + +private: + bool BuildNetwork(const char *entryGraph); + + std::unique_ptr CompileGraph(std::unique_ptr &graph, IncrementalLinearAllocator &readOnlyAllocator, + std::unordered_map &tensorAddressMap); + + Compiler &operator=(const Compiler &) = delete; +}; + +} // namespace regor diff --git a/ethosu/regor/compiler/database.hpp b/ethosu/regor/compiler/database.hpp new file mode 100644 index 00000000..f9d5d74f --- /dev/null +++ b/ethosu/regor/compiler/database.hpp @@ -0,0 +1,208 @@ +// +// SPDX-FileCopyrightText: Copyright 2022-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "common/common.hpp" + +#include "include/regor_database.hpp" + +#include +#include + +namespace regor +{ + + + +/// +/// Base database implementation +/// +class Database : public IDatabase +{ +protected: + struct DataRow + { + int uniqueId = 0; + std::unique_ptr fields; + + DataRow() = default; + + DataRow(DataRow &&other) noexcept : fields(std::move(other.fields)) { uniqueId = other.uniqueId; } + + DataRow &operator=(DataRow &&other) noexcept + { + this->uniqueId = other.uniqueId; + this->fields = std::move(other.fields); + return *this; + } + }; + + struct DataTable + { + std::string name; + std::vector rows; + std::vector columnNames; + int columns = 0; + bool isIndexed = true; + DataTable(const char *tableName) : name(tableName) {} + }; + + + struct RowIterator : public IRowIterator + { + private: + const std::string *_fields; + int _uniqueId = 0; + int _columns = 0; + int _index = -1; + + public: + RowIterator(int uniqueId, const std::string *fields, int columns) : + _fields(fields), _uniqueId(uniqueId), _columns(columns) + { + } + std::string Value() override { return _fields[_index]; } + int Id() override { return _uniqueId; } + int Column() override { return _index; } + bool Next() override + { + _index++; + return _index < _columns; + } + void Release() override { delete this; } + }; + + struct TableIterator : public ITableIterator + { + private: + std::vector &_tables; + int _index = -1; + + public: + TableIterator(std::vector &tables) : _tables(tables) {} + + public: + std::string Name() override + { + 
assert(_index >= 0); + return _tables[_index].name; + } + int Rows() override + { + assert(_index >= 0); + return int(_tables[_index].rows.size()); + } + int Columns() override + { + assert(_index >= 0); + return _tables[_index].columns; + } + IRowIterator *ColumnNames() override + { + return new RowIterator( + _tables[_index].isIndexed ? 1 : 0, _tables[_index].columnNames.data(), _tables[_index].columns); + } + IRowIterator *Row(int row) override + { + const auto &entry = _tables[_index].rows[row]; + return new RowIterator(entry.uniqueId, entry.fields.get(), _tables[_index].columns); + } + bool Next() override + { + _index++; + return _index < int(_tables.size()); + } + void Release() override { delete this; } + }; + + std::vector _tables; + +public: + int AddTable(const char *name, bool isIndexed = true) + { + _tables.emplace_back(name); + _tables.back().isIndexed = isIndexed; + return int(_tables.size()) - 1; + } + + void AddColumns(int tableId, std::initializer_list names) + { + assert(tableId >= 0 && tableId < int(_tables.size())); + DataTable *table = &_tables[tableId]; + table->columnNames.insert(table->columnNames.end(), names.begin(), names.end()); + table->columns = int(table->columnNames.size()); + } + + void AddColumns(int tableId, std::vector names) + { + assert(tableId >= 0 && tableId < int(_tables.size())); + DataTable *table = &_tables[tableId]; + table->columnNames.insert(table->columnNames.end(), names.begin(), names.end()); + table->columns = int(table->columnNames.size()); + } + + int AddRow(int tableId, int uniqueId, std::initializer_list values) + { + assert(tableId >= 0 && tableId < int(_tables.size())); + DataTable *table = &_tables[tableId]; + DataRow tmp; + tmp.uniqueId = uniqueId; + tmp.fields = std::unique_ptr(new std::string[table->columns]); + auto pos = values.begin(); + for ( int i = 0; i < table->columns && pos != values.end(); i++, pos++ ) + { + tmp.fields[i] = *pos; + } + + table->rows.push_back(std::move(tmp)); + return 
int(table->rows.size()) - 1; + } + + int AddRow(int tableId, int uniqueId, std::vector values) + { + assert(tableId >= 0 && tableId < int(_tables.size())); + DataTable *table = &_tables[tableId]; + DataRow tmp; + tmp.uniqueId = uniqueId; + tmp.fields = std::unique_ptr(new std::string[table->columns]); + auto pos = values.begin(); + for ( int i = 0; i < table->columns && pos != values.end(); i++, pos++ ) + { + tmp.fields[i] = *pos; + } + + table->rows.push_back(std::move(tmp)); + return int(table->rows.size()) - 1; + } + + void SetField(int tableId, int row, int column, const std::string &value) + { + assert(tableId >= 0 && tableId < int(_tables.size())); + DataTable *table = &_tables[tableId]; + assert(row < int(table->rows.size())); + assert(column < table->columns); + table->rows[row].fields[column] = value; + } + + // From IDatabase + ITableIterator *Tables() override { return new TableIterator(_tables); } +}; + + +} // namespace regor diff --git a/ethosu/regor/compiler/faststorage_allocator.cpp b/ethosu/regor/compiler/faststorage_allocator.cpp new file mode 100644 index 00000000..98afd7c9 --- /dev/null +++ b/ethosu/regor/compiler/faststorage_allocator.cpp @@ -0,0 +1,403 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#include "compiler/faststorage_allocator.hpp" + +#include "architecture/architecture.hpp" +#include "common/vector_span.hpp" +#include "live_range.hpp" +#include "scheduler.hpp" + +#include +#include +#include +#include +#include + +namespace regor +{ + +// FastStorageComponentAllocator +FastStorageComponentAllocator::FastStorageComponentAllocator(std::vector *baseMemUsage, + std::vector *maxMemUsage, int stagingLimit, std::unordered_map *elementAccessLrs) : + _baseMemUsage(baseMemUsage), + _maxMemUsage(maxMemUsage), _stagingLimit(stagingLimit), _elementAccessLrs(elementAccessLrs) +{ +} + +// Allocates live ranges. Outputs a vector that gives for each live range if it should be evicted or kept +void FastStorageComponentAllocator::Allocate(vector_span &lrs, std::vector &evicted) +{ + int sz = lrs.size(); + evicted.resize(sz); + _lrs = lrs; + _evicted = &evicted; + _currEvicted.resize(sz); + _bestScore = 0; + AllocateExhaustive(0, 0); + _evicted = nullptr; +} + +// Exhaustive, recursive search, starting at the given index +void FastStorageComponentAllocator::AllocateExhaustive(int ix, int score) +{ + if ( ix >= _lrs.size() ) + { + // Check if score is better (more access is better) + if ( score > _bestScore || _bestScore == 0 ) + { + // Best so far, remember this solution + _bestScore = score; + *_evicted = _currEvicted; + } + return; + } + + auto lr = _lrs[ix]; + for ( int t = lr->startTime; t <= lr->endTime; ++t ) + { + assert((*_baseMemUsage)[t] <= (*_maxMemUsage)[t]); + } + // Current peak usage during this live range + int baseUsage = *std::max_element(&(*_baseMemUsage)[lr->startTime], &(*_baseMemUsage)[lr->endTime + 1]); + bool canFit = baseUsage + lr->size <= _stagingLimit; + bool alwaysFits = canFit; + if ( canFit ) + { + // Keep current lr + int maxUsage = *std::max_element(&(*_maxMemUsage)[lr->startTime], &(*_maxMemUsage)[lr->endTime + 1]); + // If alwaysFits is true, lr can be kept regardless of the allocation of the other lrs + alwaysFits = 
maxUsage <= _stagingLimit; + _currEvicted[ix] = false; + int lrScore = 0; + auto entry = _elementAccessLrs->find(lr); + if ( entry != _elementAccessLrs->end() ) + { + lrScore = entry->second; + } + UpdateMemUsage(_baseMemUsage, lr, true); + + AllocateExhaustive(ix + 1, score + lrScore); + UpdateMemUsage(_baseMemUsage, lr, false); + } + if ( !alwaysFits ) + { + // Evict current lr + _currEvicted[ix] = true; + UpdateMemUsage(_maxMemUsage, lr, false); + AllocateExhaustive(ix + 1, score); + UpdateMemUsage(_maxMemUsage, lr, true); + } +} + +void FastStorageComponentAllocator::UpdateMemUsage(std::vector *memUsage, LiveRange *lr, bool increase) +{ + for ( int t = lr->startTime; t <= lr->endTime; ++t ) + { + (*memUsage)[t] += increase ? lr->size : -lr->size; + assert((*memUsage)[t] >= 0); + } +} + + +// FastStorageAllocator + +void FastStorageAllocator::AllocateFeatureMaps(const std::vector> &schedOps, + Schedule *schedule, const MemArea &fastStorage, Address stagingLimit) +{ + _stagingLimit = int(std::min(INT64_C(1) << 30, stagingLimit)); + // Force all OFMs to fast-storage (except final outputs) + // _scratchedFms contains the original tensor MemArea (not fast storage) that tensors will be evicted to + _scratchedFms.clear(); + for ( auto &schedOp : schedOps ) + { + if ( !schedOp->IsNpuOp() ) + { + continue; + } + + auto cost = schedule->Cost(schedOp.get()); + if ( cost->cascade == 0 ) + { + SchedulerConnection *ofm = schedOp->OFM(); + if ( !ofm->tensor->consumers.empty() && !ofm->tensor->hasCPUReaders && !ofm->tensor->isGraphOutput && + _scratchedFms.count(ofm->tensor.get()) == 0 ) + { + _scratchedFms[ofm->tensor.get()] = ofm->tensor->memArea; + ofm->tensor->memArea = fastStorage; + } + } + } + + auto lrGraph = LiveRangeGraph(); + lrGraph.ExtractLiveRangesFromCascades(schedOps, schedule, fastStorage, true); + // Populate time-array with memory used by live ranges + int maxUsage; + _maxMemUsage = lrGraph.GetTemporalMemoryUsage(maxUsage); + + if ( maxUsage <= _stagingLimit 
) + { + // All feature maps fit in fast storage + return; + } + // Not all feature maps fit in fast storage + _baseMemUsage = _maxMemUsage; + std::vector lrs; + for ( auto lr : lrGraph.LiveRanges() ) + { + for ( auto &tens : lr->tensors ) + { + if ( _scratchedFms.count(tens) ) + { + lrs.push_back(lr.get()); + for ( int t = lr->startTime; t <= lr->endTime; ++t ) + { + _baseMemUsage[t] -= lr->size; + } + break; + } + } + } + + // Perform a first sweep to keep/evict live ranges that are obviously too big + std::vector canFitLrs; + for ( auto lr : lrs ) + { + // Highest memory usage in this live range + int baseUsage = *std::max_element(&_baseMemUsage[lr->startTime], &_baseMemUsage[lr->endTime + 1]); + + if ( baseUsage + lr->size > _stagingLimit ) + { + // Cannot possibly fit + Evict(lr); + } + else + { + canFitLrs.push_back(lr); + } + } + std::vector competingLrs; + for ( auto lr : canFitLrs ) + { + maxUsage = *std::max_element(&_maxMemUsage[lr->startTime], &_maxMemUsage[lr->endTime + 1]); + if ( maxUsage <= _stagingLimit ) + { + // Definitively fits without impacting other feature maps + Keep(lr); + } + else + { + competingLrs.push_back(lr); + } + } + // For the remaining live ranges a choice must be made which to keep and which to evict. + // Divide the live ranges in connected components and do a search for each component + int sz = int(competingLrs.size()); + if ( sz == 0 ) + { + ElementwiseSanitizer(schedOps, schedule, fastStorage, lrGraph); + return; + } + + // For every competing live range accumulate the total element access for the ranges. + // Include all tensors access for a range - both read and write access. + // A live range that is used within a cascade is given the highest score possible. + // The reason is for cascaded elementwise operators where the other ifm (the cascade buffer) + // is already in fast storage. + // A live range with higher element access is considered more important to keep in + // in fast storage. 
+ std::unordered_map elementAccessLrs; + for ( auto lr : competingLrs ) + { + bool lrUsedWithinCascade = false; + int64_t access = 0; + for ( auto tens : lr->tensors ) + { + // Look at readers + for ( auto cons : tens->consumers ) + { + auto *ifm = cons->IFM(0); + auto *ifm2 = cons->TryIFM(1); + auto consCost = schedule->Cost(cons); + + CascadeInfo *cascadeInfo = + consCost == nullptr || consCost->cascade == 0 ? nullptr : &schedule->cascades[consCost->cascade]; + + if ( cascadeInfo && cons->Index() > cascadeInfo->start ) + { + lrUsedWithinCascade = true; + break; + } + + if ( ifm->tensor->srcTensor == tens->srcTensor && consCost ) + { + access += consCost->elementAccess.ifmRead[0]; + } + else if ( ifm2 && ifm2->tensor->srcTensor == tens->srcTensor && consCost ) + { + access += consCost->elementAccess.ifmRead[1]; + } + } + if ( !lrUsedWithinCascade ) + { + // Look at writers + for ( auto prod : tens->producers ) + { + auto cost = schedule->Cost(prod); + if ( cost == nullptr && prod->Parent() ) + { + // Most likely a fused LUT, use cost from primary op + cost = schedule->Cost(prod->Parent()); + } + if ( cost ) + { + access += cost->elementAccess.ofmWrite; + } + } + } + else + { + access = FastStorageComponentAllocator::MAX_ACCESS_SIZE; + } + } + elementAccessLrs[lr] = access; + } + + int start = 0; + int startTime = competingLrs[0]->startTime; + int endTime = competingLrs[0]->endTime; + FastStorageComponentAllocator componentAllocator(&_baseMemUsage, &_maxMemUsage, _stagingLimit, &elementAccessLrs); + + // Calculate and allocate connected components + for ( int i = 1; i < sz; ++i ) + { + auto lr = competingLrs[i]; + if ( lr->startTime <= endTime && i - start <= MAX_COMPONENT_SIZE ) + { + // Add to existing component + startTime = std::min(startTime, lr->startTime); + endTime = std::max(endTime, lr->endTime); + } + else + { + // lr is start of a new component; allocate the current component + vector_span span(competingLrs, start, i); + 
AllocateComponent(componentAllocator, span); + // Start a new component + start = i; + startTime = lr->startTime; + endTime = lr->endTime; + } + } + vector_span span(competingLrs, start, sz); + AllocateComponent(componentAllocator, span); + ElementwiseSanitizer(schedOps, schedule, fastStorage, lrGraph); +} + +// Allocates a connected range of live ranges +void FastStorageAllocator::AllocateComponent(FastStorageComponentAllocator &allocator, vector_span &lrs) +{ + std::vector evicted; + int sz = lrs.size(); + allocator.Allocate(lrs, evicted); + assert(sz == int(evicted.size())); + for ( int i = 0; i < sz; ++i ) + { + if ( evicted[i] ) + { + Evict(lrs[i]); + } + else + { + Keep(lrs[i]); + } + } +} + +void FastStorageAllocator::ElementwiseSanitizer(const std::vector> &schedOps, + Schedule *schedule, const MemArea &fastStorage, LiveRangeGraph &lrGraph) + +{ + // For now - enforce that both ifm's should be in the same memory for elementwise + for ( auto &schedOp : schedOps ) + { + if ( !schedOp->IsNpuOp() ) + { + continue; + } + + if ( IsBinaryElementwise(schedOp->_type) ) + { + auto *ifm = schedOp->IFM(0); + auto *ifm2 = schedOp->TryIFM(1); + auto consCost = schedule->Cost(schedOp.get()); + + CascadeInfo *cascadeInfo = + consCost == nullptr || consCost->cascade == 0 ? 
nullptr : &schedule->cascades[consCost->cascade]; + + if ( cascadeInfo && schedOp->Index() > cascadeInfo->start ) + // Within cascade there is nothing to do, since cascade buffer is in fast storage + continue; + + if ( ifm2 && ifm2->tensor->memArea != ifm->tensor->memArea ) + { + // One ifm not in fast storage + if ( ifm->tensor->memArea == fastStorage && !ifm2->tensor->IsConstant() ) + { + // Ifm in fast storage and ifm2 is not a constant + auto lr = lrGraph.GetOrCreateRange(ifm->tensor.get()); + Evict(lr); + } + + if ( ifm2->tensor->memArea == fastStorage && !ifm->tensor->IsConstant() ) + { + // Ifm2 in fast storage and ifm is not a constant + auto lr = lrGraph.GetOrCreateRange(ifm2->tensor.get()); + Evict(lr); + } + } + } + } +} + +void FastStorageAllocator::Evict(LiveRange *lr) +{ + for ( int t = lr->startTime; t <= lr->endTime; ++t ) + { + _maxMemUsage[t] -= lr->size; + } + for ( auto &tens : lr->tensors ) + { + auto entry = _scratchedFms.find(tens); + if ( entry != _scratchedFms.end() ) + { + tens->memArea = entry->second; + } + } +} + +void FastStorageAllocator::Keep(LiveRange *lr) +{ + for ( int t = lr->startTime; t <= lr->endTime; ++t ) + { + _baseMemUsage[t] += lr->size; + assert(_baseMemUsage[t] <= _stagingLimit); + } +} + +} // namespace regor diff --git a/ethosu/regor/compiler/faststorage_allocator.hpp b/ethosu/regor/compiler/faststorage_allocator.hpp new file mode 100644 index 00000000..19f512c6 --- /dev/null +++ b/ethosu/regor/compiler/faststorage_allocator.hpp @@ -0,0 +1,98 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "architecture/architecture.hpp" +#include "common/vector_span.hpp" +#include "live_range.hpp" +#include "scheduler.hpp" + +#include +#include +#include +#include +#include + +namespace regor +{ + +// Allocates a connected set of live ranges to fast storage. +// +// The allocator attempts to maximize the sum of the sizes of all feature maps +// that are placed in fast storage. +class FastStorageComponentAllocator +{ +private: + // Memory usage per timestamp when no lrs are kept + std::vector *_baseMemUsage = nullptr; + // Memory usage per timestamp when all lrs are kept + std::vector *_maxMemUsage = nullptr; + + Address _stagingLimit; + vector_span _lrs; + // Indices of evicted lrs in best solution + std::vector *_evicted = nullptr; + // Indices of evicted lrs in current solution + std::vector _currEvicted; + int _bestScore = 0; + std::unordered_map *_elementAccessLrs = nullptr; + // Use default seed (which is well-defined) to guarantee reproducible results + std::mt19937 _rng; + +public: + static constexpr int MAX_ACCESS_SIZE = std::numeric_limits::max(); + + FastStorageComponentAllocator(std::vector *baseMemUsage, std::vector *maxMemUsage, int stagingLimit, + std::unordered_map *elementAccessLrs); + // Allocates live ranges. 
Outputs a vector that gives for each live range if it should be evicted or kept + void Allocate(vector_span &lrs, std::vector &evicted); + +private: + // Exhaustive, recursive search, starting at the given index + void AllocateExhaustive(int ix, int score); + void UpdateMemUsage(std::vector *memUsage, LiveRange *lr, bool increase); +}; + +// Allocates feature maps to fast storage +class FastStorageAllocator +{ +private: + static constexpr int64_t MAX_COMPONENT_SIZE = 20; + // Remembers feature map's memory before it was allocated to fast storage + std::unordered_map _scratchedFms; + // Memory usage with all feature maps in fast storage + std::vector _maxMemUsage; + // Memory usage without feature maps that still need to be allocated + std::vector _baseMemUsage; + int _stagingLimit = 0; + +public: + void AllocateFeatureMaps(const std::vector> &schedOps, Schedule *schedule, + const MemArea &fastStorage, Address stagingLimit); + +private: + // Allocates a connected range of live ranges + void AllocateComponent(FastStorageComponentAllocator &allocator, vector_span &lrs); + void ElementwiseSanitizer(const std::vector> &schedOps, Schedule *schedule, + const MemArea &fastStorage, LiveRangeGraph &lrGraph); + void Evict(LiveRange *lr); + void Keep(LiveRange *lr); +}; + +} // namespace regor diff --git a/ethosu/regor/compiler/graph.hpp b/ethosu/regor/compiler/graph.hpp new file mode 100644 index 00000000..05755cc8 --- /dev/null +++ b/ethosu/regor/compiler/graph.hpp @@ -0,0 +1,172 @@ +// +// SPDX-FileCopyrightText: Copyright 2021, 2023-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "operation.hpp" +#include "tensor.hpp" + +#include +#include +#include +#include +#include + +namespace regor +{ + +enum class GraphNotation +{ + Invalid = 0, + GraphAPI = 1, + TFLite = 2, +}; + +/// +/// Top level Neural Network Graph (NNG) +/// +class Graph +{ +private: + std::string _name; + std::vector> _inputs; + std::vector> _outputs; + GraphNotation _notation = GraphNotation::Invalid; + uint32_t _syntaxVersion = 0; + const void *_passthrough = nullptr; // Original flatbuffer description of this model (if it was loaded from one) + std::vector _opsInScheduledOrder; + +public: + Graph() = delete; + + Graph(GraphNotation nt) : _notation(nt) {} + + Graph(const std::string &name, std::vector> inputs, std::vector> outputs, GraphNotation nt, uint32_t version) : + _name(name), _inputs(std::move(inputs)), _outputs(std::move(outputs)), _notation(nt), _syntaxVersion(version) + { + } + + ~Graph() + { + _notation = GraphNotation::Invalid; + std::vector operations; + GetAllOperations(operations); + for ( auto operation : operations ) + { + operation->Disconnect(); + } + } + +public: + const std::string &Name() const { return _name; } + uint32_t SyntaxVersion() const { return _syntaxVersion; } + + const std::vector> &Inputs() const { return _inputs; } + const std::vector> &Outputs() const { return _outputs; } + + void AddInput(const std::shared_ptr &input) { _inputs.push_back(input); } + void AddOutput(const std::shared_ptr &output) { _outputs.push_back(output); } + + bool IsInput(const Tensor *tensor) const + { + return 
std::find_if(_inputs.begin(), _inputs.end(), + [&](const std::shared_ptr &input) { return input.get() == tensor; }) != _inputs.end(); + } + bool IsOutput(const Tensor *tensor) const + { + return std::find_if(_outputs.begin(), _outputs.end(), + [&](const std::shared_ptr &output) { return output.get() == tensor; }) != _outputs.end(); + } + + GraphNotation Notation() const + { + assert(_notation != GraphNotation::Invalid); + return _notation; + } + + uint32_t Version() const { return _syntaxVersion; } + + const void *Passthrough() const { return _passthrough; } + void SetPassthrough(const void *passthrough) { _passthrough = passthrough; } + + // Finds all operations which precede a graph output and adds them to the vector in execution order + void GetAllOperations(std::vector &operations) const + { + TraverseGraphFromEnd(Outputs(), + [&](Operation *op) -> bool + { + operations.push_back(op); + return true; + }); + } + + // Get all operations in the graph, in scheduled order + const std::vector &ScheduledOrder() const { return _opsInScheduledOrder; }; + + void SetScheduledOrder(std::vector operations) { _opsInScheduledOrder = std::move(operations); } + + template + static void TraverseGraphFromEnd(const std::vector> &from, OPFUNC opFunc) + { + struct Entry + { + bool done; + Operation *op; + }; + std::unordered_set visited; + std::stack stack; + + for ( const auto &tensor : from ) + { + for ( const auto &op : tensor->Writers() ) + { + stack.push(Entry{false, op.get()}); + } + } + + while ( !stack.empty() ) + { + Entry entry = stack.top(); + stack.pop(); + if ( entry.done ) + { + if ( !opFunc(entry.op) ) + { + return; + } + } + else if ( visited.count(entry.op) == 0 ) + { + visited.insert(entry.op); + stack.push(Entry{true, entry.op}); + for ( const auto &pair : entry.op->Inputs().pairs() ) + { + for ( const auto &op : pair.second.tensor->Writers() ) + { + if ( visited.count(op.get()) == 0 ) + { + stack.push(Entry{false, op.get()}); + } + } + } + } + } + } +}; + +} // 
namespace regor diff --git a/ethosu/regor/compiler/graph_builder.cpp b/ethosu/regor/compiler/graph_builder.cpp new file mode 100644 index 00000000..ca9f2d02 --- /dev/null +++ b/ethosu/regor/compiler/graph_builder.cpp @@ -0,0 +1,530 @@ +// +// SPDX-FileCopyrightText: Copyright 2022-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "graph_builder.hpp" + +#include "attributes.hpp" +#include "common/numeric_util.hpp" +#include "graph.hpp" +#include "operation.hpp" + +#include +#include +#include + + +namespace regor +{ + +TensorUsage GraphAPIUsageToTensorUsage(GraphApi::GraphTensorUsage usage) +{ + return regor::TensorUsage(usage); // currently 1:1 mapping required +} + +namespace +{ + +// clang-format off +static constexpr std::pair s_aTosaMapping[] = { + {tosa::Op::ARGMAX, OpType::ArgMax}, + {tosa::Op::AVG_POOL2D, OpType::AvgPool}, + {tosa::Op::CONV2D, OpType::Conv2D}, + {tosa::Op::CONV3D, OpType::Conv3D}, + {tosa::Op::DEPTHWISE_CONV2D, OpType::DepthwiseConv2DBias}, + {tosa::Op::FULLY_CONNECTED, OpType::FullyConnected}, + {tosa::Op::MATMUL, OpType::MatMul}, + {tosa::Op::MAX_POOL2D, OpType::MaxPool}, + {tosa::Op::TRANSPOSE_CONV2D, OpType::TransposeConv2D}, + {tosa::Op::CLAMP, OpType::Clamp}, + {tosa::Op::SIGMOID, OpType::Sigmoid}, + {tosa::Op::TANH, OpType::Tanh}, + {tosa::Op::ADD, OpType::Add}, + {tosa::Op::ARITHMETIC_RIGHT_SHIFT, OpType::Asr}, + {tosa::Op::BITWISE_AND, 
OpType::And}, + {tosa::Op::BITWISE_OR, OpType::Or}, + {tosa::Op::BITWISE_XOR, OpType::Xor}, + {tosa::Op::INTDIV, OpType::Div}, + {tosa::Op::LOGICAL_AND, OpType::LogicalAnd}, + {tosa::Op::LOGICAL_LEFT_SHIFT, OpType::SHL}, + {tosa::Op::LOGICAL_RIGHT_SHIFT, OpType::SHR}, + {tosa::Op::LOGICAL_OR, OpType::LogicalOr}, + {tosa::Op::LOGICAL_XOR, OpType::LogicalXor}, + {tosa::Op::MAXIMUM, OpType::Maximum}, + {tosa::Op::MINIMUM, OpType::Minimum}, + {tosa::Op::MUL, OpType::Mul}, + {tosa::Op::POW, OpType::Pow}, + {tosa::Op::SUB, OpType::Sub}, + {tosa::Op::TABLE, OpType::LUT}, + {tosa::Op::ABS, OpType::Abs}, + {tosa::Op::BITWISE_NOT, OpType::Not}, + {tosa::Op::CEIL, OpType::Ceil}, + {tosa::Op::CLZ, OpType::CLZ}, + {tosa::Op::EXP, OpType::Exp}, + {tosa::Op::FLOOR, OpType::Floor}, + {tosa::Op::LOG, OpType::Log}, + {tosa::Op::LOGICAL_NOT, OpType::LogicalNot}, + {tosa::Op::NEGATE, OpType::Neg}, + {tosa::Op::RECIPROCAL, OpType::Reciprocal}, + {tosa::Op::RSQRT, OpType::Rsqrt}, + {tosa::Op::SELECT, OpType::Select}, + {tosa::Op::EQUAL, OpType::Equal}, + {tosa::Op::GREATER, OpType::Greater}, + {tosa::Op::GREATER_EQUAL, OpType::GreaterEqual}, + {tosa::Op::REDUCE_ANY, OpType::ReduceAny}, + {tosa::Op::REDUCE_ALL, OpType::ReduceAll}, + {tosa::Op::REDUCE_MAX, OpType::ReduceMax}, + {tosa::Op::REDUCE_MIN, OpType::ReduceMin}, + {tosa::Op::REDUCE_PRODUCT, OpType::ReduceProduct}, + {tosa::Op::REDUCE_SUM, OpType::ReduceSum}, + {tosa::Op::CONCAT, OpType::Concat}, + {tosa::Op::PAD, OpType::Pad}, + {tosa::Op::RESHAPE, OpType::Reshape}, + {tosa::Op::REVERSE, OpType::Reverse}, + {tosa::Op::SLICE, OpType::Slice}, + {tosa::Op::TILE, OpType::Tile}, + {tosa::Op::TRANSPOSE, OpType::Transpose}, + {tosa::Op::GATHER, OpType::Gather}, + {tosa::Op::SCATTER, OpType::Scatter}, + {tosa::Op::RESIZE, OpType::Resize}, + {tosa::Op::CAST, OpType::Cast}, + {tosa::Op::RESCALE, OpType::Rescale}, + {tosa::Op::IDENTITY, OpType::Identity}, + {tosa::Op::CUSTOM, OpType::Custom}, + {tosa::Op::COND_IF, OpType::If}, + 
{tosa::Op::WHILE_LOOP, OpType::While}, + //{tosa::Op::FFT2D, OpType::CurrentlyUnsupported}, + //{tosa::Op::RFFT2D, OpType::CurrentlyUnsupported}, + //{tosa::Op::ERF, OpType::CurrentlyUnsupported}, + //{tosa::Op::DIM, OpType::CurrentlyUnsupported}, +}; + +static constexpr std::pair s_aTypeMapping[] = { + {GraphApi::GraphDataType::Bool8, DataType::Bool8}, + {GraphApi::GraphDataType::Int4Packed8, DataType::Int4Packed8}, + {GraphApi::GraphDataType::Int8, DataType::Int8}, + {GraphApi::GraphDataType::Int16, DataType::Int16}, + {GraphApi::GraphDataType::Int32, DataType::Int32}, + {GraphApi::GraphDataType::Int48, DataType::Int48}, + {GraphApi::GraphDataType::Int64, DataType::Int64}, + {GraphApi::GraphDataType::UInt8, DataType::UInt8}, + {GraphApi::GraphDataType::UInt16, DataType::UInt16}, + {GraphApi::GraphDataType::UInt32, DataType::UInt32}, + {GraphApi::GraphDataType::UInt48, DataType::UInt48}, + {GraphApi::GraphDataType::UInt64, DataType::UInt64}, + {GraphApi::GraphDataType::BFloat16, DataType::BFloat16}, + {GraphApi::GraphDataType::Float16, DataType::Float16}, + {GraphApi::GraphDataType::Float32, DataType::Float32}, +}; +// clang-format on + +template +constexpr bool is_sorted(const std::pair (&list)[SIZE]) +{ + A v = list[0].first; + for ( size_t i = 1; i < SIZE; i++ ) + { + if ( list[i].first < v ) return false; + v = list[i].first; + } + return true; +} + +static_assert(is_sorted(s_aTosaMapping), "TOSA mapping must be sorted"); + +bool map_tosa_op(tosa::Op op, regor::OpType &tosaOp) +{ + auto pos = std::equal_range(std::begin(s_aTosaMapping), std::end(s_aTosaMapping), + std::pair(op, {}), [](const auto &a, const auto &b) { return a.first < b.first; }); + if ( pos.first == std::end(s_aTosaMapping) ) + { + return false; + } + + tosaOp = pos.first->second; + return true; +} + +static_assert(is_sorted(s_aTypeMapping), "Type mapping must be sorted"); + +bool map_data_type(GraphApi::GraphDataType type, regor::DataType &out) +{ + auto pos = 
std::equal_range(std::begin(s_aTypeMapping), std::end(s_aTypeMapping), + std::pair(type, {}), + [](const auto &a, const auto &b) { return a.first < b.first; }); + if ( pos.first == std::end(s_aTypeMapping) ) + { + return false; + } + + out = pos.first->second; + return true; +} + +} // namespace + + +GraphBuilder::GraphBuilder(const std::string &name) : _graphName(name) +{ +} + +GraphBuilder::~GraphBuilder() +{ + FreeUnconnected(); +} + +bool GraphBuilder::RequireSyntaxVersion(uint32_t version, int32_t level) +{ + _syntaxVersion = version | uint32_t(level); + + if ( _syntaxVersion > (GraphApi::VERSION_TOSA_0_60 | GraphApi::PROFILE_BASELINE) ) // 0.60.Baseline + { + return false; + } + + return true; +} + +GraphApi::GraphOperation *GraphBuilder::CreateOp(tosa::Op tosaType, const GraphKernel *kernel) +{ + OpType type = OpType::None; + if ( !map_tosa_op(tosaType, type) ) + { + return nullptr; + } + + auto op = std::make_shared(type); + if ( kernel ) + { + op->SetKernel(std::make_unique(kernel)); + } + else + { + op->SetKernel(std::make_unique(Point2i(1, 1), Point2i(1, 1), Point2i(1, 1))); + } + _operations.push_back(op); + + return op.get(); +} + + +struct GraphBuilderBuffer : public Buffer, public GraphApi::GraphBuffer +{ + template + GraphBuilderBuffer(int sizeBytes, const TYPE *p, bool alias) : Buffer(sizeBytes, p, alias) + { + } +}; + + +GraphApi::GraphBuffer *GraphBuilder::CreateBuffer(size_t sizeBytes, GraphApi::BufferMapping mapping, const void *initialData) +{ + auto buffer = std::make_shared( + int(std::clamp(sizeBytes, 0, unsigned(std::numeric_limits::max()))), + reinterpret_cast(initialData), (mapping == BufferMapping::Alias)); + _buffers.push_back(buffer); + return buffer.get(); +} + +GraphApi::GraphTensor *GraphBuilder::CreateTensor( + const char *name, const GraphShape &shape, GraphTensorLayout layout, GraphDataType dataType, GraphBuffer *buffer) +{ + DataType type; + if ( !map_data_type(dataType, type) ) + { + return nullptr; + } + + auto tensor = 
std::make_shared(name, type); + tensor->SetStorageShape(Shape(shape.axisNHWC, int(shape.count))); + // TODO: Handle external tensor format specification - tensor->SetStorageLayout(layout); + if ( buffer ) + { + assert(uintptr_t(buffer) % alignof(GraphBuilderBuffer) == 0); + auto graphBuffer = static_cast(buffer)->shared_from_this(); + int reqBytes = DataTypeStorageSizeBytes(type, tensor->StorageShape().Elements()); + assert(layout == GraphTensorLayout::Linear); + UNUSED(layout); + assert(reqBytes <= graphBuffer->Size()); + if ( reqBytes > graphBuffer->Size() ) + { + return nullptr; + } + tensor->SetBuffer(graphBuffer); + } + _tensors.push_back(tensor); + return tensor.get(); +} + +void GraphBuilder::AddInput(GraphTensor *graphTensor) +{ + auto tensor = static_cast(graphTensor); + _inputs.push_back(tensor->shared_from_this()); +} + +void GraphBuilder::AddOutput(GraphTensor *graphTensor) +{ + auto tensor = static_cast(graphTensor); + _outputs.push_back(tensor->shared_from_this()); +} + +void GraphBuilder::AddInput(GraphOperation *graphOp, GraphTensorUsage usage, GraphTensor *graphTensor) +{ + auto op = static_cast(graphOp); + // TODO check cross graph contamination - assert( std::find(_operations.begin(), _operations.end(), op) != + // _operations.end() ); + auto tensor = static_cast(graphTensor); + auto tmp = GraphAPIUsageToTensorUsage(usage); + int count = op->CountInputs(tmp); + op->ConnectInput(MakeTensorUsage(tmp, count), tensor->shared_from_this()).Set(Quantization::Unit()); +} + +void GraphBuilder::AddOutput(GraphOperation *graphOp, GraphTensorUsage usage, GraphTensor *graphTensor) +{ + auto op = static_cast(graphOp); + // TODO check cross graph contamination - assert( std::find(_operations.begin(), _operations.end(), op) != + // _operations.end() ); + auto tensor = static_cast(graphTensor); + auto tmp = GraphAPIUsageToTensorUsage(usage); + int count = op->CountOutputs(tmp); + op->ConnectOutput(MakeTensorUsage(tmp, count), 
tensor->shared_from_this()).Set(Quantization::Unit()); +} + +namespace +{ + +const FieldInfo *FindField(const TypeInfo *info, uint32_t id) +{ + assert(info); + size_t length; + const FieldInfo *table = info->Fields(length); + for ( size_t i = 0; i < length; i++ ) + { + if ( table[i].id == id ) return &table[i]; + } + return nullptr; +} + +template +void WriteField(void *p, const TYPE &value) +{ + assert(p); + *reinterpret_cast(p) = value; +} + +template +bool ConvertToType(void *p, [[maybe_unused]] uint8_t destType, const TYPE &value) +{ + if ( destType == FieldTypeId::TYPEID ) + { + WriteField(p, value); + return true; + } + return false; +} + +template<> +bool ConvertToType(void *p, uint8_t destType, const int32_t &value) +{ + assert(p); + switch ( destType ) + { + case FieldTypeId::TYPEID: + [[fallthrough]]; + case FieldTypeId::TYPEID: + WriteField(p, uint8_t(value)); + break; + case FieldTypeId::TYPEID: + [[fallthrough]]; + case FieldTypeId::TYPEID: + WriteField(p, uint16_t(value)); + break; + case FieldTypeId::TYPEID: + [[fallthrough]]; + case FieldTypeId::TYPEID: + WriteField(p, uint32_t(value)); + break; + default: + assert(false); + return false; + break; + } + return true; +} + +template<> +bool ConvertToType(void *p, uint8_t destType, const double &value) +{ + assert(p); + switch ( destType ) + { + case FieldTypeId::TYPEID: + WriteField(p, float(value)); + break; + case FieldTypeId::TYPEID: + WriteField(p, value); + break; + default: + assert(false); + return false; + break; + } + return true; +} + +template +bool WriteAttributeValue(Operation *op, GraphApi::OpAttr attrId, const TYPE &value) +{ + DynamicRef *attr = op->AttributeByKey(uint32_t(attrId) >> 12); + if ( attr ) + { + const auto *field = FindField(attr->Info(), (uint32_t(attrId) >> 4) & 0x0FFFFFFF); + if ( field ) + { + void *to = reinterpret_cast(attr->Instance()) + field->offset; + return ConvertToType(to, field->typeId, value); + } + } + return false; +} + +} // namespace + +bool 
GraphBuilder::Set(GraphOperation *graphOp, GraphApi::OpAttr attr, bool value) +{ + auto op = static_cast(graphOp); + if ( (unsigned(attr) & 0xF) != GraphApi::GRAPHAPI_TYPECODE_bool ) + { + assert(false && "Attribute type not bool"); + return false; + } + + return WriteAttributeValue(op, attr, value); +} + +bool GraphBuilder::Set(GraphOperation *graphOp, GraphApi::OpAttr attr, int32_t value) +{ + auto op = static_cast(graphOp); + if ( (unsigned(attr) & 0xF) != GraphApi::GRAPHAPI_TYPECODE_int32 ) + { + assert(false && "Attribute type not int32"); + return false; + } + return WriteAttributeValue(op, attr, value); +} + +bool GraphBuilder::Set(GraphOperation *graphOp, GraphApi::OpAttr attr, double value) +{ + auto op = static_cast(graphOp); + if ( (unsigned(attr) & 0xF) != GraphApi::GRAPHAPI_TYPECODE_double ) + { + assert(false && "Attribute type not double"); + return false; + } + return WriteAttributeValue(op, attr, value); +} + +bool GraphBuilder::Set(GraphOperation *graphOp, GraphApi::OpAttr attr, const GraphApi::GraphShape &value) +{ + auto op = static_cast(graphOp); + if ( (unsigned(attr) & 0xF) != GraphApi::GRAPHAPI_TYPECODE_GraphShape ) + { + assert(false && "Attribute type not Shape"); + return false; + } + + Shape shape(value.axisNHWC, size_t(value.count)); + return WriteAttributeValue(op, attr, shape); +} + +bool GraphBuilder::Set(GraphOperation *graphOp, GraphApi::OpAttr attr, const GraphApi::Point2 &value) +{ + auto op = static_cast(graphOp); + if ( (unsigned(attr) & 0xF) != GraphApi::GRAPHAPI_TYPECODE_Point2 ) + { + assert(false && "Attribute type not Point2"); + return false; + } + + Point2i xy(value.x, value.y); + return WriteAttributeValue(op, attr, xy); +} + +bool GraphBuilder::Set(GraphOperation *graphOp, GraphApi::OpAttr attr, const char *value) +{ + auto op = static_cast(graphOp); + if ( (unsigned(attr) & 0xF) != GraphApi::GRAPHAPI_TYPECODE_string ) + { + assert(false && "Attribute type not string"); + return false; + } + + std::string str(value); + 
return WriteAttributeValue(op, attr, str); +} + +void GraphBuilder::SetZeroPoint(GraphOperation *graphOp, GraphTensorUsage tensorUsage, double zeroPoint) +{ + auto op = static_cast(graphOp); + auto usage = GraphAPIUsageToTensorUsage(tensorUsage); + auto *conn = (usage & regor::TensorUsage::TypeMask) == regor::TensorUsage::OFM ? op->Output(usage) : op->Input(usage); + if ( conn ) + { + conn->quantization.zeroPoints = {int64_t(zeroPoint)}; + } +} + +void GraphBuilder::SetAxisOrder(GraphTensor *graphTensor, GraphApi::AxisOrder order) +{ + auto tensor = static_cast(graphTensor); + tensor->SetAxisOrder(regor::AxisOrder(order)); +} + +void GraphBuilder::SetAxisStrides([[maybe_unused]] GraphTensor *graphTensor, [[maybe_unused]] const GraphApi::GraphShape *axisStrides) +{ + assert(axisStrides == nullptr && "Not currently implemented"); +} + + +void GraphBuilder::FreeUnconnected() +{ + try + { + // In case somebody added self-supporting graph fragments + std::unordered_set connected; + Graph::TraverseGraphFromEnd(_outputs, + [&](Operation *op) -> bool + { + connected.insert(op); + return true; + }); + for ( auto &op : _operations ) + { + if ( !connected.count(op.get()) ) + { + op->Disconnect(); + } + } + } + catch ( std::bad_weak_ptr & ) + { + // ignored + } +} + + +} // namespace regor diff --git a/ethosu/regor/compiler/graph_builder.hpp b/ethosu/regor/compiler/graph_builder.hpp new file mode 100644 index 00000000..ec67499e --- /dev/null +++ b/ethosu/regor/compiler/graph_builder.hpp @@ -0,0 +1,99 @@ +// +// SPDX-FileCopyrightText: Copyright 2022-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "common/buffer_view.hpp" +#include "include/graphapi.hpp" +#include "include/graphapi_tosa_types.hpp" +#include "tensor_properties.hpp" + +#include +#include +#include + +namespace regor +{ + +class Compiler; +class Operation; +class Tensor; + +/// +/// Graph Builder implementation +/// +class GraphBuilder : public GraphApi::IGraphBuilder +{ + friend class Compiler; + using GraphTensor = GraphApi::GraphTensor; + using GraphShape = GraphApi::GraphShape; + using GraphKernel = GraphApi::GraphKernel; + using GraphOperation = GraphApi::GraphOperation; + using GraphTensorUsage = GraphApi::GraphTensorUsage; + using GraphDataType = GraphApi::GraphDataType; + using GraphBuffer = GraphApi::GraphBuffer; + using BufferMapping = GraphApi::BufferMapping; + using GraphTensorLayout = GraphApi::GraphTensorLayout; + +protected: + std::string _graphName; + uint32_t _syntaxVersion = 0; + std::vector> _operations; + std::vector> _tensors; + std::vector> _inputs; + std::vector> _outputs; + std::vector> _buffers; + +public: + GraphBuilder(const std::string &name); + ~GraphBuilder(); + +public: + // Inherited via IGraphBuilder + bool RequireSyntaxVersion(uint32_t version, int32_t level) override; + GraphOperation *CreateOp(tosa::Op opType, const GraphKernel *kernel) override; + GraphBuffer *CreateBuffer(size_t sizeBytes, BufferMapping mapping, const void *initialData) override; + GraphTensor *CreateTensor(const char *name, const GraphShape &shape, GraphTensorLayout layout, + GraphDataType dataType, GraphBuffer *buffer) override; + // Set graph 
inputs/outputs + void AddInput(GraphTensor *graphTensor) override; + void AddOutput(GraphTensor *graphTensor) override; + // Connect operator inputs/outputs + void AddInput(GraphOperation *graphOp, GraphTensorUsage usage, GraphTensor *graphTensor) override; + void AddOutput(GraphOperation *graphOp, GraphTensorUsage usage, GraphTensor *graphTensor) override; + // Object attribute and properties + bool Set(GraphOperation *graphOp, GraphApi::OpAttr attr, bool value) override; + bool Set(GraphOperation *graphOp, GraphApi::OpAttr attr, int32_t value) override; + bool Set(GraphOperation *graphOp, GraphApi::OpAttr attr, double value) override; + bool Set(GraphOperation *graphOp, GraphApi::OpAttr attr, const GraphApi::GraphShape &value) override; + bool Set(GraphOperation *graphOp, GraphApi::OpAttr attr, const GraphApi::Point2 &value) override; + bool Set(GraphOperation *graphOp, GraphApi::OpAttr attr, const char *value) override; + void SetZeroPoint(GraphOperation *op, GraphTensorUsage usage, double zeroPoint) override; + void SetAxisOrder(GraphTensor *graphTensor, GraphApi::AxisOrder order) override; + void SetAxisStrides(GraphTensor *graphTensor, const GraphApi::GraphShape *axisStrides) override; + // Utility + const std::string &Name() const { return _graphName; } + uint32_t SyntaxVersion() const { return _syntaxVersion; } + +private: + void FreeUnconnected(); +}; + +TensorUsage GraphAPIUsageToTensorUsage(GraphApi::GraphTensorUsage usage); + +} // namespace regor diff --git a/ethosu/regor/compiler/graph_optimiser.cpp b/ethosu/regor/compiler/graph_optimiser.cpp new file mode 100644 index 00000000..c3407b8f --- /dev/null +++ b/ethosu/regor/compiler/graph_optimiser.cpp @@ -0,0 +1,341 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "graph_optimiser.hpp" + +#include "common/logging.hpp" + +#include "architecture/architecture.hpp" +#include "graph.hpp" +#include "graphir_optimiser.hpp" +#include "op_type.hpp" +#include "operation.hpp" +#include "tensor.hpp" +#include "tflite_graph_optimiser.hpp" + +#include +#include +#include +#include +#include +#include +#include + +#include "include/regor.h" + +namespace regor +{ + +std::unique_ptr GraphOptimiser::MakeGraphOptimiser( + GraphNotation notation, Architecture *arch, const GraphOptimiserOptions &options, OptimiserDatabase *db) +{ + switch ( notation ) + { + case GraphNotation::TFLite: + return std::unique_ptr(std::make_unique(arch, options, db)); + + case GraphNotation::GraphAPI: + return std::unique_ptr(std::make_unique(arch, options, db)); + + default: + LOG_ERROR("Invalid graph notation"); + assert(false); + } + + return {}; +} + +// Some debug functions +#if LOG_TRACE1_ON +Operation *GraphOptimiser::VisitOperatorLog(Graph *const graph, Operation *const operation) +{ + UNUSED(graph); + if ( GraphOptimiser::Options().verboseGraph ) + { + LOG_TRACE1("Rewrite operator visits: {0} (@{1})", OpTypeToString(operation->Type()), static_cast(operation)); + auto *ifmConn = operation->Input(TensorUsage::IFM0); + LOG_TRACE1(" -- IFM shape: [{0}] read shape: [{2}] offset [{1}]", + (ifmConn == nullptr ? "" : ifmConn->shape.ToString()), (ifmConn == nullptr ? "" : ifmConn->slice.offset.ToString()), + (ifmConn == nullptr ? 
"" : ifmConn->slice.shape.ToString())); + + auto idx = 1; + auto usage = MakeTensorUsage(TensorUsage::IFM, 1); + ifmConn = operation->Input(usage); + while ( ifmConn != nullptr ) + { + LOG_TRACE1(", [{0}] read shape: [{2}] offset [{1}]", ifmConn->shape.ToString(), + ifmConn->slice.offset.ToString(), ifmConn->slice.shape.ToString()); + usage = MakeTensorUsage(TensorUsage::IFM, ++idx); + ifmConn = operation->Input(usage); + } + auto *ofmConn = operation->Output(TensorUsage::OFM); + LOG_TRACE1(" - OFM shape: [{0}] write shape: [{2}] offset [{1}]\n", + (ofmConn == nullptr ? "" : ofmConn->shape.ToString()), (ofmConn == nullptr ? "" : ofmConn->slice.offset.ToString()), + (ofmConn == nullptr ? "" : ofmConn->slice.shape.ToString())); + } + return operation; +} + +Tensor *GraphOptimiser::VisitTensorLog(Graph *const graph, Tensor *const tensor) +{ + UNUSED(graph); + if ( GraphOptimiser::Options().verboseGraph ) + { + LOG_TRACE1("Rewrite tensor visits: {0} (@{1}) -- Tensor shape: [{2}]\n", tensor->Name(), + static_cast(tensor), tensor->StorageShape().ToString()); + } + return tensor; +} +#endif + +Operation *GraphOptimiser::RecordOperation(Graph *const graph, Operation *const operation) +{ + UNUSED(graph); + if ( _db ) + { + // TODO: implement ext key tracking for TOSA Networks. 
+ _db->SourceOp(operation); + } + return operation; +} + +Operation *GraphOptimiser::RecordOptimisation(Graph *const graph, Operation *const operation) +{ + UNUSED(graph); + // Remaining ops probably reference themselves + if ( _db ) + { + _db->AddOptimised(operation, operation); + } + return operation; +} + +void GraphOptimiser::RecordOptimisation(const Operation *operation, const Operation *op) +{ + if ( _db ) + { + _db->AddOptimised(operation, op); + } +} + +void GraphOptimiser::PrintGraph(const Graph *graph, const std::string &label) const +{ + if ( graph != nullptr ) + { + if ( !label.empty() ) + { + LOG_PRINT("\n[ {0} ]\n", label); + } + std::vector ops; + graph->GetAllOperations(ops); + auto idx = 0; + for ( const auto &op : ops ) + { + OpType type = op->Type(); + // TODO: This uses the OFM tensor name to identify the operator + std::string name = op->OFM() ? op->OFM()->Name() : ""; + LOG_PRINT("{0:<5} {1:<20} {2:<30}\n", idx, OpTypeToString(type), name); + ++idx; + } + LOG_PRINT("\n"); + } +} + +void GraphOptimiser::PrintQuantization(const Graph *graph, const std::string &label) const +{ + if ( graph != nullptr ) + { + if ( !label.empty() ) + { + LOG_PRINT("\n[ {0} ]\n", label); + } + std::vector ops; + graph->GetAllOperations(ops); + auto op_idx = 0; + for ( const auto &op : ops ) + { + OpType type = op->Type(); + std::string name = op->OFM() ? 
op->OFM()->Name() : ""; + LOG_PRINT("{0} {1} {2}\n", op_idx, OpTypeToString(type), name); + + const ordered_map &inputs = op->Inputs(); + auto input_idx = 0; + for ( const auto &v : inputs ) + { + const auto &tens = v.tensor; + std::string quantization_string = v.quantization.ToString(); + LOG_PRINT(" {0} {1:02} {2} {3} {4}\n", "Input", input_idx, DataTypeToString(tens->Type()), + quantization_string, tens->Name()); + input_idx++; + } + ++op_idx; + } + LOG_PRINT("\n"); + } +} + +void GraphOptimiser::Process(Graph *graph) +{ + if ( _options.verboseGraph ) + { + PrintGraph(graph, "Before Graph Optimisation"); + } + OptimiseGraph(graph); + if ( _options.verboseGraph ) + { + PrintGraph(graph, "After Graph Optimization"); + } + if ( _options.verboseQuantization ) + { + PrintQuantization(graph, "Graph With Tensor Quantization"); + } +} + +void GraphOptimiser::ParseGraphOptimiserOptions(GraphOptimiserOptions &opt, IniReader &reader) +{ + // Parse debug settings + std::string key; + while ( reader.Begin(key) ) + { + if ( key == "verbose" ) + { + opt.verboseGraph = reader.Get(); + } + if ( key == "verbose_quantization" ) + { + opt.verboseQuantization = reader.Get(); + } + + reader.End(); + } +} + +OptimiserDatabase::OptimiserDatabase(Database *db) : _db(db) +{ + _sourceTable = _db->AddTable("source"); + _optTable = _db->AddTable("optimised"); + _cmdTable = _db->AddTable("queue", false); + _streamTable = _db->AddTable("cmdstream"); + _db->AddColumns(_sourceTable, {"operator", "kernel_w", "kernel_h", "ofm_w", "ofm_h", "ofm_d", "ext_key"}); + _db->AddColumns(_optTable, {"source_id", "operator", "kernel_w", "kernel_h", "ofm_w", "ofm_h", "ofm_d"}); + _db->AddColumns(_cmdTable, {"offset", "cmdstream_id", "optimised_id"}); +} + +Database *OptimiserDatabase::Get() +{ + return _db; +} + +int OptimiserDatabase::SourceId(const void *op) +{ + // lookup op in optimised + auto pos = _optimised.find(op); + if ( pos != std::end(_optimised) ) + { + return std::get<0>(pos->second); + } + 
else if ( auto ptr = _source.find(op); ptr != std::end(_source) ) + { + // op is original-op + return ptr->second; + } + return 0; +} + +int OptimiserDatabase::OptimisedId(const void *op) +{ + // lookup op in optimised + auto pos = _optimised.find(op); + if ( pos != std::end(_optimised) ) + { + return std::get<1>(pos->second); + } + return 0; +} + +int OptimiserDatabase::SourceOp(const Operation *op, int ext_key) +{ + auto pos = _source.find(op); + if ( pos != _source.end() ) + { + return pos->second; + } + _sourceId++; + _source.emplace(op, _sourceId); + + auto k = op->Kernel()->Size(); + auto o = Shape::PadAxes(op->OFM()->View().ViewShape(), 3, 1); + _db->AddRow(_sourceTable, _sourceId, + {OpTypeToString(op->Type()), std::to_string(k.x), std::to_string(k.y), std::to_string(o.Width()), + std::to_string(o.Height()), std::to_string(o.Depth()), std::to_string(ext_key)}); + return _sourceId; +} + +void OptimiserDatabase::AddOptimised(const void *from, const Operation *to) +{ + assert(to); + + // Locate the source operation Id (if any) + int sourceId = 0; + if ( from != nullptr ) + { + // Look for source op in optimised list first and use that op's parent + // (source replacement doesn't matter) + auto pos = _optimised.find(from); + if ( pos != _optimised.end() ) + { + sourceId = std::get<0>(pos->second); + } + else + { + auto srcPos = _source.find(from); + if ( srcPos != _source.end() ) + { + sourceId = srcPos->second; + } + } + } + + _optId++; + _optimised[to] = std::tuple(sourceId, _optId); + + auto k = to->Kernel()->Size(); + auto o = Shape::PadAxes(to->OFM()->View().ViewShape(), 3, 1); + _db->AddRow(_optTable, _optId, + {std::to_string(sourceId), OpTypeToString(to->Type()), std::to_string(k.x), std::to_string(k.y), + std::to_string(o.Width()), std::to_string(o.Height()), std::to_string(o.Depth())}); +} + +void OptimiserDatabase::AddCommand(void *key, int stream, int cmdIndex) +{ + auto pos = _optimised.find(key); + if ( pos != _optimised.end() ) + { + int optId = 
std::get<1>(pos->second); + _db->AddRow(_cmdTable, 0, {std::to_string(4 * cmdIndex), std::to_string(stream), std::to_string(optId)}); + } +} + +int OptimiserDatabase::AddStream() +{ + _streamId++; + _db->AddRow(_streamTable, _streamId, {}); + return _streamId; +} + +} // namespace regor diff --git a/ethosu/regor/compiler/graph_optimiser.hpp b/ethosu/regor/compiler/graph_optimiser.hpp new file mode 100644 index 00000000..4df471c1 --- /dev/null +++ b/ethosu/regor/compiler/graph_optimiser.hpp @@ -0,0 +1,212 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "common/logging.hpp" + +#include "architecture/architecture.hpp" +#include "common/ini_reader.hpp" +#include "graph.hpp" +#include "graph_optimiser_db.hpp" +#include "operation.hpp" +#include "tensor.hpp" + +#include +#include +#include + +#include "include/regor.h" + +namespace regor +{ + +/// +/// Tensor and Operation rewrite functions for the graph optimisation. 
+/// +template +struct RewriteFunctions +{ + const std::vector tensorFunction; + const std::vector opFunction; +}; + +/// +/// Graph optimiser options +/// +struct GraphOptimiserOptions +{ + bool verboseGraph = false; + bool verboseQuantization = false; +}; + +/// +/// Graph optimiser +/// +class GraphOptimiser +{ +protected: + Architecture *_arch = nullptr; + const GraphOptimiserOptions _options; + OptimiserDatabase *_db; + +public: + GraphOptimiser(Architecture *arch, const GraphOptimiserOptions &options, OptimiserDatabase *db) : + _arch(arch), _options(options), _db(db) + { + assert(_arch != nullptr); + } + const GraphOptimiserOptions &Options() const { return _options; } + + + + static std::unique_ptr MakeGraphOptimiser( + GraphNotation notation, Architecture *arch, const GraphOptimiserOptions &options, OptimiserDatabase *db); + static void ParseGraphOptimiserOptions(GraphOptimiserOptions &opt, IniReader &reader); + + void Process(Graph *graph); + virtual void OptimiseGraph(Graph *graph) = 0; + + // Note no check for if NPU operator, or "rewrite_unsupported". + // Such checks are delegated to each specific rewrite function. + template + void RewriteGraph(Graph *const graph, const RewriteFunctions &rewriteFuncs) + { + using OpFunction = Operation *(T::*)(Graph *, Operation *); + using TensFunction = Tensor *(T::*)(Graph *, Tensor *); + + const std::vector *opFunctions = &rewriteFuncs.opFunction; + const std::vector *tensFunctions = &rewriteFuncs.tensorFunction; + + // TODO: MLBEDSW-9057: Check when specific rewrite functions are added + struct Entry + { + bool done; + std::shared_ptr op; + }; + std::unordered_set opVisited; + std::unordered_set tensVisited; + std::stack stack; + + for ( const auto &tensor : graph->Outputs() ) + { + for ( const auto &op : tensor->Writers() ) + { + stack.push(Entry{false, op}); + } + } + + while ( !stack.empty() ) + { + Entry entry = stack.top(); + stack.pop(); + + // Currently we do not do anything with entry.done elements. 
+ if ( !entry.done && opVisited.count(entry.op.get()) == 0 && !entry.op->IsDisconnected() ) + { + Operation *updatedOp = entry.op.get(); + Operation *prevOp = nullptr; + + // Process op + if ( !opFunctions->empty() ) + { + // Op have been updated, parse it again. + while ( prevOp != updatedOp ) + { + if ( prevOp != nullptr ) + { + // prevOp was removed and replaced by updatedOp; remove the stale pointer from opVisited + opVisited.erase(prevOp); + } + prevOp = updatedOp; + // Execute operator functions + for ( const auto &func : *opFunctions ) + { + updatedOp = (static_cast(this)->*(func))(graph, updatedOp); + } + } + } + opVisited.insert(updatedOp); + stack.push(Entry{true, updatedOp->shared_from_this()}); + + std::array, 2> tensors; + for ( const auto &pair : updatedOp->Outputs().pairs() ) + { + tensors[0].push_back(pair.second.tensor.get()); + } + for ( const auto &pair : updatedOp->Inputs().pairs() ) + { + tensors[1].push_back(pair.second.tensor.get()); + } + for ( auto idx = 0; idx < 2; ++idx ) + { + for ( const auto &tens : tensors[idx] ) + { + Tensor *updatedTensor = tens; + + // Process Tensor if not already visited + if ( tensVisited.count(tens) == 0 ) + { + Tensor *prevTensor = nullptr; + + if ( !tensFunctions->empty() ) + { + // Tensor have been updated, parse it again. + while ( prevTensor != updatedTensor ) + { + if ( prevTensor != nullptr ) + { + tensVisited.erase(prevTensor); + } + prevTensor = updatedTensor; + // Execute tensor functions + for ( const auto &func : *tensFunctions ) + { + updatedTensor = (static_cast(this)->*(func))(graph, updatedTensor); + } + } + } + tensVisited.insert(updatedTensor); + } + // Check Writers() for tensor, even if visited as we can bypass op updating the tensors. 
+                        const std::vector<std::shared_ptr<Operation>> *ops = &updatedTensor->Writers();
+                        for ( const auto &op : *ops )
+                        {
+                            if ( opVisited.count(op.get()) == 0 )
+                            {
+                                stack.push(Entry{false, op});
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+#if LOG_TRACE1_ON
+    Operation *VisitOperatorLog(Graph *const graph, Operation *const operation);
+    Tensor *VisitTensorLog(Graph *const graph, Tensor *const tensor);
+#endif
+    Operation *RecordOperation(Graph *const graph, Operation *const operation);
+    Operation *RecordOptimisation(Graph *const graph, Operation *const operation);
+    void RecordOptimisation(const Operation *operation, const Operation *op);
+    void PrintGraph(const Graph *graph, const std::string &label) const;
+    void PrintQuantization(const Graph *graph, const std::string &label) const;
+    virtual ~GraphOptimiser() = default;
+};
+
+} // namespace regor
diff --git a/ethosu/regor/compiler/graph_optimiser_db.hpp b/ethosu/regor/compiler/graph_optimiser_db.hpp
new file mode 100644
index 00000000..45fcc61b
--- /dev/null
+++ b/ethosu/regor/compiler/graph_optimiser_db.hpp
@@ -0,0 +1,60 @@
+//
+// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the License); you may
+// not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an AS IS BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// + +#pragma once + +#include "database.hpp" + +#include +#include +#include + +namespace regor +{ + +class Operation; + +/// +/// Graph optimiser database implementation +/// +class OptimiserDatabase +{ +private: + Database *_db = nullptr; + int _sourceId = 0; + int _optId = 0; + int _streamId = 0; + int _sourceTable = 0; + int _optTable = 0; + int _cmdTable = 0; + int _streamTable = 0; + std::unordered_map _source; + std::unordered_map> _optimised; + +public: + OptimiserDatabase(Database *db); + Database *Get(); + int SourceId(const void *op); + int OptimisedId(const void *op); + int SourceOp(const Operation *op, int ext_key = -1); + void AddOptimised(const void *from, const Operation *to); + void AddCommand(void *key, int stream, int cmdIndex); + int AddStream(); +}; + +} // namespace regor diff --git a/ethosu/regor/compiler/graph_packing.cpp b/ethosu/regor/compiler/graph_packing.cpp new file mode 100644 index 00000000..fc8f884b --- /dev/null +++ b/ethosu/regor/compiler/graph_packing.cpp @@ -0,0 +1,249 @@ +// +// SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#include "graph_packing.hpp" + +#include "common/logging.hpp" + +#include "graph.hpp" +#include "scheduler_operation.hpp" +#include "tensor.hpp" + +#include +#include +#include +#include +#include + +namespace regor +{ + +GraphPacking::GraphPacking() +{ +} + +std::unique_ptr GraphPacking::Process(std::vector>> &npuOps, + std::vector> &ops, std::unordered_map &tensorAddressMap, const Graph *srcGraph) +{ + // Build a new graph where consecutive operations running on NPU are collapsed into a Ethos-U op + // (OpType::CustomNpuOp). CPU operations are left unchanged. The algorithm makes two passes over the scheduled + // operations, where the first pass creates new operations (Ethos-U and CPU) and connects the CPU operations, and + // the second pass connects the Ethos-U operations. + + // List of all ops in new graph in scheduled order + std::vector newOpsInScheduledOrder; + + std::shared_ptr currentOp = nullptr; + NPUOperation *currentNpuOp = nullptr; + + // Pack consecutive NPU ops into a NPUOperation + for ( auto &schedOp : ops ) + { + if ( schedOp->IsNpuOp() ) + { + if ( !currentNpuOp ) + { + // Create new Ethos-U operation for the new graph + currentOp = std::make_shared(OpType::CustomNpuOp); + + // Create new NPUOperation that collects consecutive NPU operations + auto newNpuOp = std::make_unique(); + currentNpuOp = newNpuOp.get(); + + newOpsInScheduledOrder.push_back(currentOp.get()); + + npuOps.emplace_back(currentOp.get(), std::move(newNpuOp)); + } + + // Map old scheduler operation and its sub-ops to new operation + _oldOpToNewOp[schedOp.get()] = currentOp; + for ( const auto &subOp : schedOp->SubOps() ) + { + assert(subOp->IsNpuOp()); + _oldOpToNewOp[subOp.get()] = currentOp; + } + + currentNpuOp->AddOperation(std::move(schedOp)); + } + else + { + // Create new CPU operation for the new graph + assert(schedOp->_srcKey != nullptr); + currentOp = std::make_shared(*static_cast(schedOp->_srcKey)); + currentNpuOp = nullptr; + + 
newOpsInScheduledOrder.push_back(currentOp.get()); + + // Map old scheduler operation to new CPU operation + _oldOpToNewOp[schedOp.get()] = currentOp; + + for ( const auto &[usage, schedConn] : schedOp->inputs.pairs() ) + { + const auto &schedTensor = schedConn.tensor; + + // Connect input tensor to new CPU operation + const auto oldTensor = schedTensor->srcTensor; + assert(oldTensor && "Missing source graph tensor"); + const auto newTensor = LookupNewTensor(oldTensor.get(), tensorAddressMap, schedTensor->allocatedAddress); + currentOp->ConnectInput(usage, newTensor).Set(schedConn.shape).Set(schedConn.quantization); + } + + for ( const auto &[usage, schedConn] : schedOp->outputs.pairs() ) + { + const auto &schedTensor = schedConn.tensor; + + // Connect output tensor to new CPU operation + const auto oldTensor = schedTensor->srcTensor; + assert(oldTensor && "Missing source graph tensor"); + const auto newTensor = LookupNewTensor(oldTensor.get(), tensorAddressMap, schedTensor->allocatedAddress); + currentOp->ConnectOutput(usage, newTensor).Set(schedConn.shape).Set(schedConn.quantization); + } + } + } + + currentNpuOp = nullptr; + currentOp = nullptr; + + for ( auto &item : npuOps ) + { + Operation *op = item.first; + + for ( const auto &schedOp : item.second->Operations() ) + { + for ( const auto &schedConn : schedOp->inputs ) + { + const auto &schedTensor = schedConn.tensor; + if ( schedTensor->IsConstant() ) + { + // Don't connect constant tensors - they are handled at scheduler level + continue; + } + + const bool isConsumedByUs = std::any_of(schedTensor->consumers.begin(), schedTensor->consumers.end(), + [&, op](SchedulerOperation *cons) { return _oldOpToNewOp.at(cons).get() == op; }); + const bool isProducedByUsOnly = std::all_of(schedTensor->producers.begin(), schedTensor->producers.end(), + [&, op](SchedulerOperation *prod) { return _oldOpToNewOp.at(prod).get() == op; }); + if ( isConsumedByUs && isProducedByUsOnly && !schedTensor->isGraphInput ) + { + // 
Don't connect NPU internal tensors + continue; + } + + // Connect input tensor to new Ethos-U operation, but only once + const auto &oldTensor = schedTensor->srcTensor; + assert(oldTensor && "Missing source graph tensor"); + const auto newTensor = LookupNewTensor(oldTensor.get(), tensorAddressMap, schedTensor->allocatedAddress); + if ( op->UsageOfTensor(newTensor.get()) == TensorUsage::None ) + { + const auto usage = MakeTensorUsage(TensorUsage::IFM, op->Inputs().size()); + op->ConnectInput(usage, newTensor).Set(schedConn.quantization); + } + } + + for ( const auto &schedConn : schedOp->outputs ) + { + const auto &schedTensor = schedConn.tensor; + + const bool isProducedByUs = std::any_of(schedTensor->producers.begin(), schedTensor->producers.end(), + [&](SchedulerOperation *prod) { return _oldOpToNewOp.at(prod).get() == op; }); + const bool isConsumedByUsOnly = std::all_of(schedTensor->consumers.begin(), schedTensor->consumers.end(), + [&](SchedulerOperation *cons) { return _oldOpToNewOp.at(cons).get() == op; }); + if ( isProducedByUs && isConsumedByUsOnly && !schedTensor->isGraphOutput ) + { + // Don't connect NPU internal tensors + continue; + } + + // Connect output tensor to new Ethos-U operation, but only once + const auto &oldTensor = schedTensor->srcTensor; + assert(oldTensor && "Missing source graph tensor"); + const auto newTensor = LookupNewTensor(oldTensor.get(), tensorAddressMap, schedTensor->allocatedAddress); + if ( op->UsageOfTensor(newTensor.get()) == TensorUsage::None ) + { + const auto usage = MakeTensorUsage(TensorUsage::OFM, op->Outputs().size()); + op->ConnectOutput(usage, newTensor).Set(schedConn.quantization); + } + } + } + } + + // Clear ops since they have been moved into relevant NPUOperation object + ops.clear(); + _oldOpToNewOp.clear(); + + auto graph = std::make_unique(srcGraph->Notation()); + graph->SetPassthrough(srcGraph->Passthrough()); + + // Transfer graph input tensors from old graph + for ( const auto &graphInput : 
srcGraph->Inputs() )
+    {
+        graph->AddInput(LookupNewTensor(graphInput.get()));
+    }
+
+    // Transfer graph output tensors from old graph
+    for ( const auto &graphOutput : srcGraph->Outputs() )
+    {
+        graph->AddOutput(LookupNewTensor(graphOutput.get()));
+    }
+
+    _oldTensorToNewTensor.clear();
+
+    // Save the execution order of all ops in the new graph
+    graph->SetScheduledOrder(std::move(newOpsInScheduledOrder));
+
+    return graph;
+}
+
+std::shared_ptr<Tensor> GraphPacking::LookupNewTensor(Tensor *oldTensor)
+{
+    const auto it = _oldTensorToNewTensor.find(oldTensor);
+    if ( it == _oldTensorToNewTensor.end() )
+    {
+        // This cloned tensor will be used in the new graph
+        std::shared_ptr<Tensor> newTensor = oldTensor->Clone();
+
+        _oldTensorToNewTensor[oldTensor] = newTensor;
+
+        return newTensor;
+    }
+    else
+    {
+        return it->second;
+    }
+}
+
+std::shared_ptr<Tensor> GraphPacking::LookupNewTensor(
+    Tensor *oldTensor, std::unordered_map<const Tensor *, Address> &tensorAddressMap, Address allocatedAddress)
+{
+    const auto newTensor = LookupNewTensor(oldTensor);
+
+    // This cloned tensor will use same address as the original tensor
+    tensorAddressMap[newTensor.get()] = allocatedAddress;
+
+    return newTensor;
+}
+
+std::unique_ptr<Graph> PackScheduleToGraph(std::vector<std::pair<Operation *, std::unique_ptr<NPUOperation>>> &npuOps,
+    std::vector<std::unique_ptr<SchedulerOperation>> &ops, std::unordered_map<const Tensor *, Address> &tensorAddressMap, const Graph *srcGraph)
+{
+    GraphPacking p;
+
+    return p.Process(npuOps, ops, tensorAddressMap, srcGraph);
+}
+
+} // namespace regor
diff --git a/ethosu/regor/compiler/graph_packing.hpp b/ethosu/regor/compiler/graph_packing.hpp
new file mode 100644
index 00000000..8cbb6b29
--- /dev/null
+++ b/ethosu/regor/compiler/graph_packing.hpp
@@ -0,0 +1,60 @@
+//
+// SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the License); you may
+// not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "graph.hpp" +#include "operation.hpp" +#include "scheduler_operation.hpp" +#include "tensor.hpp" + +#include +#include +#include + +namespace regor +{ + +/// +/// Graph packing +/// +class GraphPacking +{ +public: + GraphPacking(); + +public: + std::unique_ptr Process(std::vector>> &npuOps, + std::vector> &ops, + std::unordered_map &tensorAddressMap, const Graph *srcGraph); + +private: + std::unordered_map> _oldOpToNewOp; + std::unordered_map> _oldTensorToNewTensor; + + std::shared_ptr LookupNewTensor(Tensor *oldTensor); + std::shared_ptr LookupNewTensor( + Tensor *oldTensor, std::unordered_map &tensorAddressMap, Address allocatedAddress); +}; + +// Pack list of scheduler operations into one or more graphs +std::unique_ptr PackScheduleToGraph(std::vector>> &npuOps, + std::vector> &ops, + std::unordered_map &tensorAddressMap, const Graph *srcGraph); + +} // namespace regor diff --git a/ethosu/regor/compiler/graph_validator.cpp b/ethosu/regor/compiler/graph_validator.cpp new file mode 100644 index 00000000..3ca341e7 --- /dev/null +++ b/ethosu/regor/compiler/graph_validator.cpp @@ -0,0 +1,54 @@ +// +// SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an AS IS BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "graph_validator.hpp"
+
+#include "tosa_graph_validator.hpp"
+
+namespace regor
+{
+
+std::unique_ptr<GraphValidator> GraphValidator::MakeGraphValidator(GraphNotation notation, uint32_t syntaxVersion, Compiler *compiler)
+{
+    if ( notation == GraphNotation::GraphAPI )
+    {
+        if ( TosaGraphValidator::HandlesSyntax(syntaxVersion) )
+        {
+            return std::make_unique<TosaGraphValidator>(notation, syntaxVersion, compiler);
+        }
+    }
+    return std::make_unique<GraphValidator>(notation, syntaxVersion);
+}
+
+bool GraphValidator::Validate(Graph *)
+{
+    _validationErrors.emplace_back(Error{OpType::None, "Unsupported graph Notation/SyntaxVersion"});
+    return false;
+}
+
+std::string GraphValidator::GetErrorMsg()
+{
+    std::string errorMsg = "Validation error:\n";
+    for ( auto &error : _validationErrors )
+    {
+        errorMsg += OpTypeToString(error.operation) + error.errorMessage + "\n";
+    }
+    return errorMsg;
+}
+
+} // namespace regor
diff --git a/ethosu/regor/compiler/graph_validator.hpp b/ethosu/regor/compiler/graph_validator.hpp
new file mode 100644
index 00000000..09011075
--- /dev/null
+++ b/ethosu/regor/compiler/graph_validator.hpp
@@ -0,0 +1,59 @@
+//
+// SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the License); you may
+// not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an AS IS BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#pragma once
+
+#include "graph.hpp"
+
+#include
+#include
+
+#include "include/regor.h"
+
+namespace regor
+{
+
+class Compiler;
+
+class GraphValidator
+{
+public:
+    struct Error
+    {
+        OpType operation;
+        std::string errorMessage;
+    };
+
+protected:
+    GraphNotation _notation;
+    uint32_t _syntaxVersion;
+    std::vector<Error> _validationErrors;
+
+public:
+    GraphValidator(GraphNotation notation, uint32_t syntaxVersion) : _notation(notation), _syntaxVersion(syntaxVersion)
+    {
+    }
+    virtual ~GraphValidator() = default;
+
+    static std::unique_ptr<GraphValidator> MakeGraphValidator(GraphNotation notation, uint32_t syntaxVersion, Compiler *compiler);
+    virtual bool Validate(Graph *graph);
+    std::vector<Error> &GetErrors() { return _validationErrors; }
+    std::string GetErrorMsg();
+};
+
+} // namespace regor
diff --git a/ethosu/regor/compiler/graphir_optimiser.cpp b/ethosu/regor/compiler/graphir_optimiser.cpp
new file mode 100644
index 00000000..1dbc99e8
--- /dev/null
+++ b/ethosu/regor/compiler/graphir_optimiser.cpp
@@ -0,0 +1,187 @@
+//
+// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the License); you may
+// not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "compiler/graphir_optimiser.hpp" + +#include "optimiser_utils.hpp" + +namespace regor +{ + +using namespace GraphOptimisation; +Tensor *GraphIrOptimiser::ConvertInt48Tensors(Graph *, Tensor *tensor) +{ + if ( tensor->Type() == DataType::Int48 && !tensor->IsConstant() ) + { + tensor->ChangeType(DataType::Int64); + } + else if ( tensor->Type() == DataType::UInt48 && !tensor->IsConstant() ) + { + tensor->ChangeType(DataType::UInt64); + } + return tensor; +} + + +Operation *GraphIrOptimiser::ConvertAttributes(Graph *const graph, Operation *const operation) +{ + UNUSED(graph); + OpType opType = operation->Type(); + if ( opType == OpType::Asr ) + { + auto roundMode = operation->attr.asr.round ? RoundMode::NATURAL : RoundMode::TRUNCATE_TO_LOWER; + operation->SetRounding(roundMode); + } + return operation; +} + +Operation *GraphIrOptimiser::ConvertResizeOffsets(Graph *const graph, Operation *const operation) +{ + UNUSED(graph); + // Reduce positive offset parameters that are larger than scale_n + // If offset >= scale_n, we can create an ifm-slice to start on offset/scale_n. + // The offset parameters are updated to the remainder of the fraction. 
+ Operation *returnOp = operation; + OpType opType = operation->Type(); + if ( opType == OpType::Resize ) + { + auto &attr = operation->attr; + TensorConnection *ifmConn = operation->Input(TensorUsage::IFM); + Shape ifmStart = ifmConn->shape.WithZeros(); + Shape ifmShape = ifmConn->shape; + int offset_h = attr.resize.offsetYX[0]; + int offset_w = attr.resize.offsetYX[1]; + int scale_nh = attr.resize.scaleY.n; + int scale_nw = attr.resize.scaleX.n; + if ( offset_h >= scale_nh ) + { + ifmStart[1] += offset_h / scale_nh; + ifmShape[1] -= ifmStart[1]; + attr.resize.offsetYX[0] = offset_h % scale_nh; + } + if ( offset_w >= scale_nw ) + { + ifmStart[2] += offset_w / scale_nw; + ifmShape[2] -= ifmStart[2]; + attr.resize.offsetYX[1] = offset_w % scale_nw; + } + TensorSlice slice{std::move(ifmStart), std::move(ifmShape)}; + ifmConn->Set(slice); + } + return returnOp; +} + +Operation *GraphIrOptimiser::RemoveReshape(Graph *const graph, Operation *const operation) +{ + Operation *returnOp = operation; + OpType opType = operation->Type(); + + if ( IsReshape(opType) ) + { + auto *ifmConn = operation->Input(TensorUsage::IFM0); + auto *ofmConn = operation->Output(TensorUsage::OFM); + auto *ifm = ifmConn->tensor.get(); + auto *ofm = ofmConn->tensor.get(); + + // Check if ifm/ofm are network ifm/ofm + bool isIfmSgIfm = IsTensorInVector(graph->Inputs(), ifm); + bool isOfmSgOfm = IsTensorInVector(graph->Outputs(), ofm); + bool isIfmSgOfm = IsTensorInVector(graph->Outputs(), ifm); + + // TODO: MLBEDSW-9069: Check CPU operator producer/consumer + + // Inserts a copy op if needed before removing reshapes. + if ( (isIfmSgIfm || isIfmSgOfm) && (isOfmSgOfm) ) + { + auto copyOp = InsertCopyOpAfterTensor(ifmConn->tensor, ifmConn->quantization); + // reset the ifm to reflect the reshape's new ifm + ifmConn = operation->Input(TensorUsage::IFM0); + ifm = ifmConn->tensor.get(); + returnOp = copyOp.get(); + RecordOptimisation(operation, returnOp); + // Reshape still needs to be removed. 
+ } + + // Remove the reshape and one of the tensors. + if ( isOfmSgOfm ) + { + // TODO: This path should also be used for ofm tensors consumed by CPU ops. + + // The OFM is in graph outputs, do not remove this tensor. + // Bypass by replacing ifm with ofm. + // Set OFM as output for IFM producers + ReplaceProducerOutput(ifm->Writers(), ifm, ofmConn->tensor); + + // Set OFM as input to other IFM consumers. + ReplaceConsumerInput(operation, ifm->Readers(), ifm, ofmConn->tensor); + } + else + { + // Bypass by replacing ofm with ifm. + // Set IFM as input to OFM consumers. + ReplaceConsumerInput(nullptr, ofm->Readers(), ofm, ifmConn->tensor); + } + // Remove the reshape from ifm readers and ofm writers. + // Note the Inputs/Outputs on operation should still be intact to not break the traversal. + ifm->RemoveReader(operation->shared_from_this()); + ofm->RemoveWriter(operation->shared_from_this()); + } + + return returnOp; +} + +Operation *GraphIrOptimiser::RewriteFullyConnected(Graph *const graph, Operation *const operation) +{ + Operation *returnOp = operation; + OpType opType = operation->Type(); + if ( opType == OpType::FullyConnected ) + { + const auto &weights = operation->Input(TensorUsage::Weights); + const auto &shape = weights->tensor->StorageShape(); + if ( weights->tensor->AxisOrder() == AxisOrder::OI && shape.Size() == 2 ) + { + // Reshape weight tensor from (num_outputs, ..., num_inputs) to (num_outputs, 1, 1, num_inputs) + weights->tensor->SetAxisOrder(AxisOrder::OHWI); + weights->tensor->Reshape(Shape(shape[0], 1, 1, shape[-1])); + } + assert(weights->tensor->AxisOrder() == AxisOrder::OHWI); + } + + return returnOp; +} + +GraphIrOptimiser::GraphIrOptimiser(Architecture *arch, const GraphOptimiserOptions &options, OptimiserDatabase *db) : + GraphOptimiser(arch, options, db) +{ +} + +void GraphIrOptimiser::OptimiseGraph(Graph *graph) +{ + for ( auto iOpt = GraphOptimisationSteps().begin(); iOpt != GraphOptimisationSteps().end(); ++iOpt ) + { + 
LOG_TRACE1("GraphOptimiser {0}/{1}\n", std::distance(GraphOptimisationSteps().begin(), iOpt) + 1, + GraphOptimisationSteps().size()); + // Check if function lists are empty. Do not call for step that only contain disabled debug functions. + if ( !iOpt->opFunction.empty() || !iOpt->tensorFunction.empty() ) + { + RewriteGraph(graph, *iOpt); + } + } +} + +} // namespace regor diff --git a/ethosu/regor/compiler/graphir_optimiser.hpp b/ethosu/regor/compiler/graphir_optimiser.hpp new file mode 100644 index 00000000..8dfd2899 --- /dev/null +++ b/ethosu/regor/compiler/graphir_optimiser.hpp @@ -0,0 +1,107 @@ +// +// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#pragma once + +#include "common/logging.hpp" + +#include "graph.hpp" +#include "graph_optimiser.hpp" +#include "operation.hpp" +#include "tensor.hpp" + +#include + +namespace regor +{ + +/// +/// GraphIR Graph optimiser +/// +class GraphIrOptimiser : public GraphOptimiser +{ + using OpRewriteFunction = Operation *(GraphIrOptimiser::*)(Graph *, Operation *); + using TensorRewriteFunction = Tensor *(GraphIrOptimiser::*)(Graph *, Tensor *); + using GraphOptStepArray = std::vector>; + +private: + Operation *RemoveReshape(Graph *const graph, Operation *const operation); + Operation *ConvertAttributes(Graph *const graph, Operation *const operation); + Operation *ConvertResizeOffsets(Graph *const graph, Operation *const operation); + Tensor *ConvertInt48Tensors(Graph *graph, Tensor *tensor); + Operation *RewriteFullyConnected(Graph *const graph, Operation *const operation); + +public: + // The graph optimisation steps. + // Order matters, array of rewrites processed in order. + // clang-format off + const GraphOptStepArray _graphOptimisationSteps = + {{ + { + { +#if LOG_TRACE1_ON + &GraphOptimiser::VisitTensorLog +#endif + }, + { +#if LOG_TRACE1_ON + &GraphOptimiser::VisitOperatorLog, +#endif + &GraphOptimiser::RecordOperation + } + }, + { + { + &GraphIrOptimiser::ConvertInt48Tensors, + }, + { + &GraphIrOptimiser::RemoveReshape, + } + }, + { + {}, + { + &GraphIrOptimiser::ConvertAttributes, + &GraphIrOptimiser::ConvertResizeOffsets, + &GraphIrOptimiser::RewriteFullyConnected, + } + }, + { + { +#if LOG_TRACE1_ON + &GraphOptimiser::VisitTensorLog +#endif + }, + { +#if LOG_TRACE1_ON + &GraphOptimiser::VisitOperatorLog, +#endif + &GraphOptimiser::RecordOptimisation + } + } + }}; + // clang-format on + + explicit GraphIrOptimiser(Architecture *arch, const GraphOptimiserOptions &options, OptimiserDatabase *db); + + const GraphOptStepArray &GraphOptimisationSteps() const { return _graphOptimisationSteps; } + + void OptimiseGraph(Graph *graph); +}; + +} // namespace regor 
diff --git a/ethosu/regor/compiler/high_level_command_stream.hpp b/ethosu/regor/compiler/high_level_command_stream.hpp new file mode 100644 index 00000000..86edcaf1 --- /dev/null +++ b/ethosu/regor/compiler/high_level_command_stream.hpp @@ -0,0 +1,273 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "common/common.hpp" + +#include "architecture/architecture.hpp" +#include "architecture/weight_encoder.hpp" +#include "common/box.hpp" +#include "common/data_type.hpp" +#include "common/reverse_type.hpp" +#include "common/shape.hpp" +#include "common/transpose_type.hpp" +#include "kernel.hpp" +#include "scheduler_operation.hpp" + +#include +#include + +namespace regor +{ + +enum class HLCRoundMode : uint8_t +{ + DBL = 0, + TRUNCATE = 1, + NATURAL = 2, + TRUNCATE_TO_LOWER = 3, + DOUBLE_ASYMMETRIC = 4, + SYMMETRIC = 5, + AUTO = 0xff +}; + +struct HLCPadding +{ + int top = 0; + int left = 0; + int bottom = 0; + int right = 0; + + std::string ToString() const + { + return fmt::format("[top:{},left:{},bottom:{},right:{}]", top, left, bottom, right); + } +}; + +/// +/// IFM/OFM information needed to generate register commands +/// +struct HLCFeatureMap +{ + TensorFormat format = TensorFormat::Unknown; + MemArea memArea; + Shape shape; + Shape strides; + Point2i stepXY = {1, 1}; + DataType dataType; + Address address = 
-1; + BufferView bufferView; + Quantization quantization; + ArchResampling resamplingMode = ArchResampling::None; + TransposeType transpose = TransposeType::None; + ReverseType reverse = ReverseType::None; + + int AllocationSizeBytes() const { return TensorAllocationBytes(shape, format, dataType); } + + std::string ToString() const + { + return fmt::format("[{}], format: {}, {}:{}, address: {}", shape.ToString(), format, memArea.memory->Name(), + memArea.usage.ToString(), address); + } +}; + +/// +/// Information about encoded weights +/// +struct HLCWeights +{ + MemArea memArea; + Buffering buffering; + Flags format; + Address address = -1; + int maxRangeBytes; // When double buffering: size of a single buffer + int subStreams = 1; + std::unordered_map encodedRanges; + + std::string ToString() const + { + return fmt::format("{} ranges, buffering: {}, {}:{}, address: {}, format: {}", encodedRanges.size(), + int(buffering), memArea.memory->Name(), memArea.usage, address, format.ToString()); + } +}; + +// Parameters that apply only to particular sub operation +union HLCParameters +{ + // Alpha value for LeakyReLU + struct + { + float alpha; + } leaky_relu; + + // Location of the source LUT, to be DMAed to LUT memory + struct LUT + { + MemArea memArea; + Address address; + int sizeBytes; + DataType ifmType; + } lut; + + struct + { + GraphApi::FractionND scaleY; + GraphApi::FractionND scaleX; + int offsetY; + int offsetX; + ArchResizeMode mode; + } resize; + + struct + { + int axis; + } argmax; +}; + +/// +/// Sub operation +/// +struct HLCSubOperation +{ + OpType type = OpType::None; + HLCParameters parameters = {}; +}; + +/// +/// Contains information needed to generate register commands for an NPU operation. +/// +/// There is one HLCOperation for every SchedulerOperation. 
Each HLCOperation can be +/// associated with one (= non-cascaded) or more (= cascaded) HLCStripes +/// +struct HLCOperation +{ + OpType type; + Kernel kernel; + std::vector ifm; + HLCFeatureMap ofm; + std::unique_ptr weights; + std::unique_ptr scales; + std::vector subOps; + HLCParameters parameters = {}; + HLCRoundMode rounding; + ArchitectureOpConfig *config = nullptr; + void *_srcKey = nullptr; + +#ifndef NDEBUG + std::string name; // name of OFM +#endif + + std::string ToString() const + { + std::string k = kernel.Size().x == 0 ? "" : kernel.ToString(); +#ifdef NDEBUG + std::string name = ""; +#endif + std::string subOpStr = subOps.empty() ? " -" : ""; + for ( auto &subOp : subOps ) + { + subOpStr += " " + OpTypeToString(subOp.type); + } + return fmt::format("{} {}, subOps:{}, {} {}", OpTypeToString(type), name, subOpStr, k, config ? config->ToString(false) : ""); + } +}; + +class HighLevelCommand +{ +public: + virtual ~HighLevelCommand() = default; + virtual bool IsStripe() const { return false; } + virtual std::string ToString() const = 0; +}; + +/// +/// High level command that performs part of or whole NPU operation, +/// depending on the box settings. 
+/// +class HLCStripe : public HighLevelCommand +{ +public: + std::shared_ptr operation; + std::vector ifmAreas; + Box ofmArea; + int weightRangeDepth = 0; // Identifies depth slice + HLCPadding padding; + +public: + HLCStripe(const std::shared_ptr &operation_) : operation(operation_) {} + bool IsStripe() const override { return true; } + + std::string ToString() const override + { + std::string ofm = ""; +#ifndef NDEBUG + ofm = " -> " + operation->name; +#endif + std::string extra = ""; + if ( ifmAreas.size() > 1 ) + { + extra = fmt::format(", IFM2 {}", ifmAreas[1].ToString()); + } + else if ( operation->weights != nullptr && operation->weights->buffering != Buffering::None ) + { + extra = fmt::format(", Weight depth: {}", weightRangeDepth); + } + if ( padding.top != 0 || padding.bottom != 0 ) + { + extra += fmt::format(", padding: {}", padding.ToString()); + } + if ( ofmArea.SizeShape().Elements() != operation->ofm.shape.Elements() ) + { + extra += (ofmArea.SizeShape().ElementsWH() == operation->ofm.shape.ElementsWH()) ? ", buffered" : ", cascaded"; + } + return fmt::format("{}{} OFM area {}, IFM {}{}", OpTypeToString(operation->type), ofm, ofmArea.ToString(), + ifmAreas[0].ToString(), extra); + } +}; + +/// +/// High level command that performs a DMA operation. 
+/// +class HLCDMA : public HighLevelCommand +{ +public: + MemArea srcMemArea; + Address srcAddress; + Shape srcStrides; // Only valid for Ethos U85 + bool srcIndexed; // Only valid for Ethos U85 + MemArea destMemArea; + Address destAddress; + Shape destStrides; // Only valid for Ethos U85 + bool destIndexed; // Only valid for Ethos U85 + MemArea idxMemArea; // Only valid for Ethos U85 + Address idxAddress; // Only valid for Ethos U85 + int idxSkip1; // Only valid for Ethos U85 + int idxMax; // Only valid for Ethos U85 + int length; + Shape sizes; // Only valid for Ethos U85 + + std::string ToString() const override + { + return fmt::format("DMA src: {}:{}, address: {}, dest: {}:{}, address: {}, size: {}", srcMemArea.memory->Name(), + srcMemArea.usage, srcAddress, destMemArea.memory->Name(), destMemArea.usage, destAddress, + sizes ? sizes.ToString() : std::to_string(length)); + } +}; + +} // namespace regor diff --git a/ethosu/regor/compiler/high_level_command_stream_generator.cpp b/ethosu/regor/compiler/high_level_command_stream_generator.cpp new file mode 100644 index 00000000..ec714708 --- /dev/null +++ b/ethosu/regor/compiler/high_level_command_stream_generator.cpp @@ -0,0 +1,813 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#include "high_level_command_stream_generator.hpp" + +#include "common/common.hpp" +#include "common/logging.hpp" + +#include "common/box.hpp" +#include "common/numeric_util.hpp" +#include "common/vector_span.hpp" +#include "high_level_command_stream.hpp" +#include "scheduler.hpp" + +#include +#include + +namespace regor +{ + + +static void CalcPaddingAndSkirt(const Kernel *kernel, const Shape &inputShape, const Shape &outputShape, HLCPadding &padding, HLCPadding &skirt) +{ + auto dilatedWH = kernel->DilatedWH(); + int ypad = NeededTotalPadding(inputShape.Height(), outputShape.Height(), kernel->Stride().y, dilatedWH.y); + int xpad = NeededTotalPadding(inputShape.Width(), outputShape.Width(), kernel->Stride().x, dilatedWH.x); + const auto &pad = kernel->Padding(); + padding.left = pad.Left(); + padding.right = pad.Right(); + padding.top = pad.Top(); + padding.bottom = pad.Bottom(); + skirt.top = padding.top; + skirt.left = padding.left; + skirt.bottom = std::max(ypad - padding.top, dilatedWH.y - 1); + skirt.right = std::max(xpad - padding.left, dilatedWH.x - 1); +} + +enum class TransformLimit +{ + None, + Wrap, +}; + +static Box TransformWithStridesAndSkirt(const Box &outputArea, const Shape *strides, const HLCPadding *skirt, const Shape &ifmShape, + OpType opType, const Shape &concatOffsets, const Shape &splitOffset, const Shape &splitShape, int dilatedKernelHeight, int upscalingFactor, + int &padTop, int &padBottom, TransformLimit limit = TransformLimit::None, TransposeType transposeType = TransposeType::None) +{ + Shape outputAreaStart = outputArea.Start().Untranspose(transposeType); + Shape outputAreaEnd = outputArea.End().Untranspose(transposeType); + Shape concatOffsetsUntransposed = concatOffsets.Untranspose(transposeType); + Shape outputAreaSize = outputAreaEnd - outputAreaStart; + // Make start/end at least 4 dimensional + Shape start = Shape::Max(outputAreaStart - concatOffsetsUntransposed, Shape(0, 0, 0, 0)); + Shape end = Shape::Max(start + 
outputAreaSize, Shape(1, 1, 1, 1)); + start += splitOffset; + end += splitOffset; + if ( (IsConvolution(opType) && !IsDepthwise(opType)) ) + { + if ( splitOffset.Size() == 0 ) + { + start = start.WithDepth(0); + end = end.WithDepth(ifmShape.Depth()); + } + else + { + start = start.WithDepth(splitOffset.Depth()); + end = end.WithDepth(start.Depth() + splitShape.Depth()); + } + } + else if ( IsVectorProduct(opType) || opType == OpType::ReduceSum ) + { + // these types of operations do a "dot product" or sum over the entire IFM - full shape needed + if ( splitOffset.Size() == 0 ) + { + start = Shape(0, 0, 0, 0); + end = Shape::PadAxes(ifmShape, 4, 1); + } + else + { + start = splitOffset; + end = start + splitShape; + } + } + else if ( opType == OpType::Resize ) + { + // TODO MLBEDSW-8660: Striping of resize operations + return (splitOffset.Size() > 0) ? Box(splitOffset, splitOffset + splitShape) : Box(Shape::PadAxes(ifmShape, 4, 1)); + } + else if ( IsBinaryElementwise(opType) && splitOffset.Size() != 0 ) + { + // Elementwise with splitShape. IFM might not cover full OFM and can be broadcasted in that case + start = splitOffset; + end = start + splitShape; + } + end = Shape::Min(end, Shape::Max(ifmShape, Shape(1, 1, 1, 1)).WithHW(ifmShape.Height() * upscalingFactor, ifmShape.Width() * upscalingFactor)); + padTop = 0; + padBottom = 0; + + assert(strides != nullptr && skirt != nullptr); + assert(strides->Size() == 4); + + int strideW = strides->Width(); + int validIfmOffset = splitOffset.IsEmpty() ? 
0 : splitOffset.Width(); + start = start.WithWidth(std::max(start.Width() * strideW - skirt->left, validIfmOffset)); + int validIfmWidth = ifmShape.Width(); + if ( splitShape.Size() > 1 && splitOffset.Size() > 1 ) + validIfmWidth = std::min(validIfmWidth, splitShape.Width() + splitOffset.Width()); + else if ( splitShape.Size() > 1 ) validIfmWidth = splitShape.Width(); + end = end.WithWidth(std::min(end.Width() * strideW + skirt->right, validIfmWidth)); + int strideH = strides->Height(); + int skirtTopRemainder = skirt->top % upscalingFactor; + int totalStride = strideH * (outputAreaEnd.Height() - outputAreaStart.Height() - 1); + int startHeight = start.Height() * strideH - skirt->top + skirtTopRemainder; + padTop = std::max(0, -startHeight) + skirtTopRemainder; + start = start.WithHeight(std::max(startHeight, 0)); + if ( end.Height() * strideH + skirt->bottom > ifmShape.Height() * upscalingFactor ) + { + // padBottom is calculated based the diff between the end position of the weight kernel, + // after last stride and the ifm height. + if ( upscalingFactor != 1 && outputAreaEnd.Height() > ifmShape.Height() * upscalingFactor ) + { + // Special case for Transpose Convolution with VALID padding. 
+ padBottom = outputAreaEnd.Height() - ifmShape.Height() * upscalingFactor; + } + else + { + int kernelStart = start.Height() - padTop; + padBottom = std::max(0, kernelStart + totalStride + dilatedKernelHeight - ifmShape.Height() * upscalingFactor); + } + } + // Adjust for upscaling + start = start.WithHeight(std::max(start.Height() / upscalingFactor, 0)); + int endHeight = end.Height() * strideH + skirt->bottom + skirt->bottom % upscalingFactor; + end = end.WithHeight(std::min(std::max(endHeight / upscalingFactor, 1), ifmShape.Height())); + + if ( limit == TransformLimit::Wrap ) + { + Shape ifmWrap = Shape::PadAxes(ifmShape, 4, 1); + Shape one(1, 1, 1, 1); + start = Shape::Wrap(start, ifmWrap); + end = Shape::Wrap(end - one, ifmWrap) + one; + assert((end - start).Elements() > 0); + } + + return Box(start, end); +} + +static int StrideAdjustedPadding(int pad, int outOffset, int kernelStride) +{ + return pad - outOffset * (kernelStride - 1); +} + +static int MarginForKernel(int paddedInput, int dilatedSize, int stride) +{ + assert(stride > 0); + int margin = paddedInput % stride; + if ( margin == 0 ) + { + margin += stride; + } + return std::max(0, dilatedSize - margin); +} + +static std::pair TransformWithInputOutputSteps(const Box &inputArea, const Point2i &inputStep, + const Box &outputArea, const Point2i &outputStep, class Kernel *kernel, const HLCPadding &padding) +{ + const auto &stride = kernel->Stride(); + const auto dilatedWH = kernel->DilatedWH(); + HLCPadding newPadding; + auto adjustedTopPad = StrideAdjustedPadding(padding.top, outputArea.Start().Height(), stride.y); + auto adjustedLeftPad = StrideAdjustedPadding(padding.left, outputArea.Start().Width(), stride.x); + newPadding.top = std::max(0, DivRoundUp(adjustedTopPad, inputStep.y)); + newPadding.left = std::max(0, DivRoundUp(adjustedLeftPad, inputStep.x)); + Point2i startAdjustForPadFraction; + startAdjustForPadFraction.x = newPadding.left * inputStep.x - adjustedLeftPad; + 
startAdjustForPadFraction.y = newPadding.top * inputStep.y - adjustedTopPad; + Point2i neededInput; + neededInput.x = DivRoundUp(outputArea.End().Width() - outputArea.Start().Width(), outputStep.x) * stride.x; + neededInput.x += MarginForKernel(neededInput.x, dilatedWH.x, stride.x); + neededInput.y = DivRoundUp(outputArea.End().Height() - outputArea.Start().Height(), outputStep.y) * stride.y; + neededInput.y += MarginForKernel(neededInput.y, dilatedWH.y, stride.y); + Shape newStart = + inputArea.Start() + .WithWidth(inputArea.Start().Width() + startAdjustForPadFraction.x) + .WithHeight(inputArea.Start().Height() + startAdjustForPadFraction.y); + newPadding.bottom = std::max(0, + neededInput.y - + (DivRoundUp((inputArea.End().Height() - inputArea.Start().Height()) - startAdjustForPadFraction.y, inputStep.y) + + newPadding.top)); + newPadding.right = std::max(0, + neededInput.x - + (DivRoundUp(inputArea.End().Width() - inputArea.Start().Width() - startAdjustForPadFraction.x, inputStep.x) + + newPadding.left)); + return std::make_pair(Box(newStart, inputArea.End()), std::move(newPadding)); +} + +// Calculates STRIDE_C/Y/X +static Shape GetStrides(const HLCFeatureMap &fm) +{ + auto elemSize = DataTypeSizeBits(fm.dataType) / 8; + if ( fm.format == TensorFormat::NHWC ) + { + int strideC = elemSize; + int strideX = fm.shape.Depth() * strideC; + int strideY = fm.shape.Width() * strideX; + int strideN = fm.shape.Height() * strideY; + return Shape(strideN, strideY, strideX, strideC); + } + else if ( fm.format == TensorFormat::NHCWB16 ) + { + int strideX = 16 * elemSize; + int strideC = strideX * fm.shape.Width(); + int strideY = elemSize * fm.shape.Width() * RoundAway(fm.shape.Depth(), 16); + int strideN = fm.shape.Height() * strideY; + return Shape(strideN, strideY, strideX, strideC); + } + else + { + assert(false && "Unsupported tensor format"); + return Shape(0, 0, 0, 0); + } +} + +static void MakeFeatureMap(const SchedulerConnection *schedConn, HLCFeatureMap &fm) +{ + 
auto schedTens = schedConn->tensor.get(); + fm.shape = schedConn->shape; + fm.dataType = schedTens->dataType; + fm.memArea = schedTens->memArea; + fm.format = schedTens->format; + fm.address = schedTens->allocatedAddress; + fm.quantization = schedConn->quantization; + fm.bufferView = schedTens->bufferView; + fm.strides = GetStrides(fm); + fm.stepXY = schedConn->stepXY; + fm.transpose = schedConn->transpose; + fm.reverse = schedConn->reverse; + fm.resamplingMode = schedConn->resamplingMode; +} + +static std::unique_ptr MakeWeights(NpuWeightTensor *srcTensor, Buffering buffering, SchedulerTensor *bufTensor = nullptr) +{ + auto weights = std::make_unique(); + if ( buffering == Buffering::None ) + { + assert(!bufTensor); + } + if ( bufTensor == nullptr ) + { + bufTensor = srcTensor; + } + weights->address = bufTensor->allocatedAddress; + weights->memArea = bufTensor->memArea; + weights->buffering = buffering; + // Same function is used for generating scales - scales have no config or weight format, so set to default + weights->format = srcTensor->config ? 
srcTensor->config->Format() : Flags(WeightFormat::Default); + weights->maxRangeBytes = srcTensor->maxRangeBytes; + weights->subStreams = srcTensor->subStreams; + weights->encodedRanges = srcTensor->encodedRanges; + return weights; +} + +static std::shared_ptr MakeOperation(SchedulerOperation *schedOp, SchedulerOpInfo *opInfo) +{ + auto op = std::make_shared(); + op->type = schedOp->Type(); + op->kernel = *schedOp->Kernel(); + op->config = opInfo->Config(); + op->rounding = HLCRoundMode(schedOp->Rounding()); + op->_srcKey = schedOp->_srcKey; + for ( int i = 0; i < MAX_NUM_IFM; ++i ) + { + auto ifm = schedOp->TryIFM(i); + if ( ifm != nullptr ) + { + HLCFeatureMap fm; + MakeFeatureMap(ifm, fm); + op->ifm.push_back(fm); + } + } + MakeFeatureMap(schedOp->OFM(), op->ofm); +#ifndef NDEBUG + op->name = schedOp->OFM()->tensor->Name(); +#endif + if ( opInfo->npuWeightsTensor != nullptr ) + { + assert(schedOp->TryInput(TensorUsage::Weights) != nullptr); + op->weights = MakeWeights(opInfo->npuWeightsTensor.get(), opInfo->bufferedWeightTensor.buffering, + opInfo->bufferedWeightTensor.tensor.get()); + } + + if ( opInfo->npuScalesTensor != nullptr ) + { + // Only scales encoded + op->scales = MakeWeights(opInfo->npuScalesTensor.get(), Buffering::None); + } + else if ( schedOp->TryInput(TensorUsage::Scales) != nullptr ) + { + // Weights and scales encoded together + assert(!!opInfo->npuWeightsTensor); + op->scales = MakeWeights(opInfo->npuWeightsTensor.get(), opInfo->bufferedWeightTensor.buffering, + opInfo->bufferedWeightTensor.tensor.get()); + } + + auto lutConn = schedOp->TryInput(TensorUsage::LUT); + if ( lutConn != nullptr ) + { + // Add sub op for operations using LUT + HLCSubOperation lutSubOp; + lutSubOp.type = OpType::LUT; + auto ¶m = lutSubOp.parameters.lut; + auto lut = lutConn->tensor; + auto end = schedOp->SubOps().end(); + auto subOp = std::find_if(schedOp->SubOps().begin(), end, [](auto &so) { return so->Type() == OpType::LUT; }); + // Register command stream 
generator will allocate the LUT + // in LUT memory and generate DMA for the LUT; for this + // it must know the location of the tensor in read-only memory + param.memArea = lut->memArea; + param.address = lut->allocatedAddress; + param.sizeBytes = lut->AllocationSizeBytes(); + param.ifmType = subOp != end ? (*subOp)->IFM(0)->tensor->dataType : schedOp->IFM(0)->tensor->dataType; + op->subOps.push_back(std::move(lutSubOp)); + } + for ( auto &subOp : schedOp->SubOps() ) + { + if ( subOp->Type() != OpType::LUT ) + { + HLCSubOperation hlcSubOp; + hlcSubOp.type = subOp->Type(); + // TODO: add op type specific info + if ( subOp->Type() == OpType::LeakyRelu ) + { + const auto ¶meters = subOp->Parameters(); + hlcSubOp.parameters.leaky_relu.alpha = parameters.leaky_relu.alpha; + } + op->subOps.push_back(std::move(hlcSubOp)); + } + } + const auto ¶meters = schedOp->Parameters(); + const auto &attr = schedOp->Attributes(); + const auto &ifmShape = schedOp->IFM(0)->shape; + switch ( schedOp->Type() ) + { + case OpType::LeakyRelu: + op->parameters.leaky_relu.alpha = parameters.leaky_relu.alpha; + break; + case OpType::Resize: + op->parameters.resize.scaleY = attr.resize.scaleY; + op->parameters.resize.scaleX = attr.resize.scaleX; + op->parameters.resize.offsetY = attr.resize.offsetYX[0]; + op->parameters.resize.offsetX = attr.resize.offsetYX[1]; + if ( ifmShape.Width() == 1 && ifmShape.Height() == 1 ) + { + // 1x1 IFMs can be handled with replicate + op->parameters.resize.mode = ArchResizeMode::Replicate; + } + else if ( attr.resize.mode == tosa::ResizeMode::NEAREST ) + { + op->parameters.resize.mode = ArchResizeMode::Nearest; + } + else + { + op->parameters.resize.mode = ArchResizeMode::Bilinear; + } + break; + case OpType::ArgMax: + op->parameters.argmax.axis = attr.axis.axis; + break; + default: + break; + } + return op; +} + +// Finds the next stripe command in the stream +static HLCStripe *FindNextStripe(HLCStream &cmds, int fromIndex) +{ + int sz = int(cmds.size()); + for 
( int i = fromIndex; i < sz; ++i ) + { + if ( cmds[i]->IsStripe() ) + { + return static_cast(cmds[i].get()); + } + } + assert(fromIndex != 0); // Every stream should contain at least one stripe + return nullptr; +} + +// Generates DMA command for Scatter/Gather +void HLCStreamGenerator::GenerateHLCDMACommands(SchedulerOperation *op, const std::shared_ptr &hlcOp, HLCStream &cmds) +{ + UNUSED(op); + + auto opType = hlcOp->type; + assert(opType == OpType::Scatter || opType == OpType::Gather); + + int ifmSrc = 0; + + if ( opType == OpType::Scatter ) + { + auto &ifm = hlcOp->ifm[0]; // GraphIR Scatter values_in + auto &ofm = hlcOp->ofm; // GraphIR Scatter values_out + assert(ifm.AllocationSizeBytes() == ofm.AllocationSizeBytes()); + + // Generate HLCDMA that copies values_in to values_out + auto dma = std::make_unique(); + dma->srcMemArea = ifm.memArea; + dma->srcAddress = ifm.address; + dma->srcStrides = GetStrides(ifm); + dma->destMemArea = ofm.memArea; + dma->destAddress = ofm.address; + dma->destStrides = GetStrides(ofm); + dma->length = ifm.AllocationSizeBytes(); + + cmds.push_back(std::move(dma)); + + ifmSrc = 2; + } + + auto &valFm = hlcOp->ifm[0]; // GraphIR Scatter values_in or GraphIR Gather values + auto &idxFm = hlcOp->ifm[1]; // GraphIR Scatter indicies or GraphIR Gather indices + auto &srcFm = hlcOp->ifm[ifmSrc]; // GraphIR Scatter input or GraphIR Gather values + auto &ofm = hlcOp->ofm; // GraphIR Scatter values_out or GraphIR Gather output + assert(idxFm.dataType == DataType::Int32 || idxFm.dataType == DataType::Int64); + assert(srcFm.dataType == ofm.dataType); + + // Generate HLCDMA that scatters or gathers + auto dma = std::make_unique(); + dma->srcMemArea = srcFm.memArea; + dma->srcAddress = srcFm.address; + dma->srcIndexed = (opType == OpType::Gather); + dma->idxMemArea = idxFm.memArea; + dma->idxAddress = idxFm.address; + dma->destMemArea = ofm.memArea; + dma->destAddress = ofm.address; + dma->destIndexed = (opType == OpType::Scatter); + dma->length 
= DataTypeStorageSizeBytes(srcFm.dataType, srcFm.shape[-1]); + dma->idxMax = valFm.shape[-2] - 1; + + auto srcStrides = GetStrides(srcFm); + auto destStrides = GetStrides(ofm); + + if ( opType == OpType::Scatter && idxFm.dataType == DataType::Int64 ) + { + // Do scatter in 3D mode with index skip because HW can only use int32 indicies + dma->srcStrides = Shape(srcStrides[-2], 0, srcStrides[-1]); + dma->destStrides = Shape(0, destStrides[-2], destStrides[-1]); + dma->sizes = idxFm.shape.Extract({-1, -2}); + dma->idxSkip1 = 4; + } + else if ( opType == OpType::Gather && idxFm.dataType == DataType::Int64 ) + { + // Do gather in 3D mode with index skip because HW can only use int32 indicies + dma->srcStrides = Shape(0, srcStrides[-2], srcStrides[-1]); + dma->destStrides = Shape(destStrides[-2], 0, destStrides[-1]); + dma->sizes = idxFm.shape.Extract({-1, -2}); + dma->idxSkip1 = 4; + } + else + { + // Do scatter or gather in 2D mode + dma->destStrides = std::move(destStrides); + dma->srcStrides = std::move(srcStrides); + dma->sizes = idxFm.shape.Extract({-2, -1}); + dma->idxSkip1 = 0; + } + + cmds.push_back(std::move(dma)); +} + +// Generates DMA command for weights +static std::unique_ptr GenerateWeightDMA(NpuWeightTensor *weightTens, const SchedulerConnection &bufConn, int depth, int depthIndex) +{ + auto dma = std::make_unique(); + dma->srcMemArea = weightTens->memArea; + dma->srcAddress = weightTens->allocatedAddress; + dma->length = 0; + int offset0 = 0; // offset of the first substream + for ( int subStream = 0; subStream < weightTens->subStreams; ++subStream ) + { + auto item = weightTens->encodedRanges.find(WeightKey(subStream, depth)); + if ( item == weightTens->encodedRanges.end() ) + { + assert(subStream > 0); + } + else + { + if ( subStream == 0 ) + { + offset0 = item->second.offset; + dma->srcAddress += offset0; + } + dma->length = RoundAway(item->second.offset + item->second.TotalBytes() - offset0, 16); + } + } + dma->destMemArea = bufConn.tensor->memArea; 
+ dma->destAddress = bufConn.tensor->allocatedAddress; + if ( bufConn.buffering == Buffering::Double && depthIndex % 2 == 1 ) + { + dma->destAddress += weightTens->maxRangeBytes; + } + return dma; +} + +void HLCStreamGenerator::GenerateHLCStripeCommands(SchedulerOperation *op, const std::shared_ptr &hlcOp, HLCStream &cmds) +{ + auto opInfo = _schedule->Cost(op); + HLCPadding skirt; + HLCPadding padding; + auto kernel = op->Kernel(); + assert(kernel != nullptr && "Operators must have a kernel"); + Shape strides = Shape(1, kernel->Stride().y, kernel->Stride().x, 1); + auto opType = op->Type(); + auto ofmConn = op->OFM(); + auto ifm0Conn = op->IFM(0); + const auto &ofmShape = ofmConn->SliceShape(); + const auto &ifm0Shape = ifm0Conn->SliceShape(); + + auto *ifm1Conn = op->TryIFM(1); + auto maxIfmShape = ifm0Shape; + if ( ifm1Conn && IsBinaryElementwise(opType) ) + { + // Use full ifm shape for broadcast elementwise operators + maxIfmShape = Shape::Max(ifm0Conn->SliceShape(), ifm1Conn->SliceShape()); + } + CalcPaddingAndSkirt(kernel, maxIfmShape, ofmShape, padding, skirt); + + int upscaling = 1; + if ( opType == OpType::Conv2DBackpropInputSwitchedBias ) + { + upscaling = ofmShape.Height() / ifm0Shape.Height(); + } + auto &depthSlices = opInfo->ofmDepthSlices; + int dilatedKernelHeight = kernel->DilatedWH().y; + + // Define Start and End coordinates for the OFM + auto ofmStart = Shape(0, 0, 0, depthSlices[0]); + auto ofmEnd = Shape::PadAxes(ofmShape, 4, 1); + if ( ofmConn->slice.offset.Size() > 0 ) + { + ofmStart = ofmConn->slice.offset; + ofmEnd = ofmConn->slice.offset + ofmConn->slice.shape; + } + assert(hlcOp->ifm.size() <= 2); + + // Binary elementwise using broadcast to repeat smaller IFMs over larger IFM volumes need their + // coordinates to wrap at the limits of the smaller IFM volume. + TransformLimit ifmLimit = IsBinaryElementwise(op->Type()) ? 
TransformLimit::Wrap : TransformLimit::None; + + const auto &ofmStep = opInfo->stripe; + for ( int startHeight = ofmStart.Height(); startHeight < ofmEnd.Height(); startHeight += ofmStep.Height() ) + { + int endHeight = std::min(startHeight + ofmStep.Height(), ofmEnd.Height()); + for ( int startWidth = ofmStart.Width(); startWidth < ofmEnd.Width(); startWidth += ofmStep.Width() ) + { + int endWidth = std::min(startWidth + ofmStep.Width(), ofmEnd.Width()); + for ( int depthIndex = 0; depthIndex < int(depthSlices.size()) - 1; ++depthIndex ) + { + int startChannel = std::max(depthSlices[depthIndex], ofmStart.Depth()); + int endChannel = std::min(depthSlices[depthIndex + 1], ofmEnd.Depth()); + + // Construct the output area for the current stripe + auto outputAreaStart = Shape(ofmStart.Batch(), startHeight, startWidth, startChannel); + auto outputAreaEnd = Shape(ofmEnd.Batch(), endHeight, endWidth, endChannel); + auto outputArea = Box(outputAreaStart, outputAreaEnd); + auto hlcStripe = std::make_unique(hlcOp); + hlcStripe->padding = padding; + hlcStripe->ofmArea = outputArea; + for ( unsigned ifmIndex = 0; ifmIndex < hlcOp->ifm.size(); ++ifmIndex ) + { + auto ifmConn = op->IFM(ifmIndex); + // Calculate input area based on the output area + auto inputArea = TransformWithStridesAndSkirt(outputArea, &strides, &skirt, ifmConn->shape, opType, + ofmConn->slice.offset, ifmConn->slice.offset, ifmConn->slice.shape, dilatedKernelHeight, + upscaling, hlcStripe->padding.top, hlcStripe->padding.bottom, ifmLimit, ofmConn->transpose); + if ( ofmConn->stepXY != Point2i{1, 1} || ifmConn->stepXY != Point2i{1, 1} ) + { + std::tie(inputArea, hlcStripe->padding) = TransformWithInputOutputSteps( + inputArea, ifmConn->stepXY, outputArea, ofmConn->stepXY, kernel, hlcStripe->padding); + } + hlcStripe->ifmAreas.push_back(inputArea); + } + if ( opInfo->npuWeightsTensor != nullptr ) + { + hlcStripe->weightRangeDepth = startChannel; + if ( opInfo->bufferedWeightTensor.tensor != nullptr && + 
(startHeight == ofmStart.Height() || opInfo->bufferedWeightTensor.buffering == Buffering::Double) ) + { + assert(opInfo->npuWeightsTensor->config->DepthOffsets().size() == depthSlices.size()); + // Metadata of new weights to put into the weight buffer tensor + auto newWeights = std::make_tuple(opInfo->npuWeightsTensor->equivalenceId, startChannel, depthIndex); + if ( _filledWeightBuffers.count(opInfo->bufferedWeightTensor.tensor.get()) == 0 ) + { + // There is nothing in the weights buffer tensor yet + cmds.push_back(GenerateWeightDMA(opInfo->npuWeightsTensor.get(), + opInfo->bufferedWeightTensor, startChannel, depthIndex)); + } + else + { + auto ¤tWeights = _filledWeightBuffers[opInfo->bufferedWeightTensor.tensor.get()]; + if ( currentWeights != newWeights ) + { + // There is something in the weights buffer tensor, but it's not correct + cmds.push_back(GenerateWeightDMA(opInfo->npuWeightsTensor.get(), + opInfo->bufferedWeightTensor, startChannel, depthIndex)); + } + } + _filledWeightBuffers[opInfo->bufferedWeightTensor.tensor.get()] = newWeights; + } + } + else + { + hlcStripe->weightRangeDepth = -1; + } + cmds.push_back(std::move(hlcStripe)); + } + } + } +} + +void HLCStreamGenerator::GenerateCommands(SchedulerOperation *op, const std::shared_ptr &hlcOp, HLCStream &cmds) +{ + auto opType = op->Type(); + + if ( IsDma(opType) ) + { + GenerateHLCDMACommands(op, hlcOp, cmds); + } + else + { + GenerateHLCStripeCommands(op, hlcOp, cmds); + } +} + +void HLCStreamGenerator::GenerateCommandsForCascade(vector_span> cascadedOps, + vector_span> hlcOps, const CascadeInfo *cascadeInfo, HLCStream &cmds) +{ + // High level command stream for each individual operation + std::vector cmdsForOps; + std::vector currIndex; + // Performed stripe at each operation + std::vector availableStripe; + // Next stripe to be performed at each operation + std::vector nextStripe; + int nrOps = cascadedOps.size(); + assert(cascadeInfo != nullptr); + // Apply intermediate feature map shapes to 
cascaded operations + for ( int i = 1; i < nrOps; ++i ) + { + auto item = cascadeInfo->buffers.find(*cascadedOps[i]); + if ( item == cascadeInfo->buffers.end() ) + { + assert(false); + } + else + { + auto &shape = item->second.shape; + hlcOps[i - 1]->ofm.shape = shape; + hlcOps[i]->ifm[cascadedOps[i]->PrimaryIfmIndex()].shape = shape; + } + } + // Generate high level commands for every operation in the cascade; + // keep the generated streams in separate lists + for ( int i = 0; i < nrOps; ++i ) + { + HLCStream stream; + GenerateCommands(cascadedOps[i].get(), hlcOps[i], stream); + currIndex.push_back(0); + availableStripe.push_back(nullptr); + nextStripe.push_back(FindNextStripe(stream, 0)); + cmdsForOps.push_back(std::move(stream)); + } + // Combine the generated command streams for the individual operations to a single stream. + // A command on one level can only performed when its input has been produced at the previous level. + int opIndex = 0; + while ( true ) + { + int &ix = currIndex[opIndex]; + if ( opIndex == 0 || + nextStripe[opIndex]->ifmAreas[cascadedOps[opIndex]->PrimaryIfmIndex()].End().IsSubShapeOf( + availableStripe[opIndex - 1]->ofmArea.End()) ) + { + auto &stream = cmdsForOps[opIndex]; + assert(ix < int(stream.size())); + HighLevelCommand *hlc = stream[ix].get(); + cmds.push_back(std::move(cmdsForOps[opIndex][ix])); + ++ix; + if ( hlc->IsStripe() ) + { + availableStripe[opIndex] = nextStripe[opIndex]; + nextStripe[opIndex] = FindNextStripe(stream, ix); + if ( opIndex < nrOps - 1 && + nextStripe[opIndex + 1]->ifmAreas[cascadedOps[opIndex + 1]->PrimaryIfmIndex()].End().IsSubShapeOf( + availableStripe[opIndex]->ofmArea.End()) ) + { + // Enough output has been produced to continue at next level + ++opIndex; + } + if ( nextStripe[opIndex] == nullptr ) + { + // Finished + assert(opIndex >= nrOps - 1); + break; + } + } + } + else + { + // More input is needed from the previous level + --opIndex; + } + } +} + +HLCStream 
HLCStreamGenerator::GenerateCommandStream(const NPUOperation *npuOp, const Schedule *schedule, bool verbose) +{ + HLCStream cmds; + _schedule = schedule; + auto &npuOps = npuOp->Operations(); + // Create HLCOperation for every ScheduledOperation + std::vector> hlcOps; + for ( auto &schedOp : npuOps ) + { + auto op = schedOp.get(); + hlcOps.push_back(MakeOperation(op, schedule->Cost(op))); + } + // Generate the command stream + int sz = int(npuOps.size()); + for ( int i = 0; i < sz; ++i ) + { + auto op = npuOps[i].get(); + auto opInfo = schedule->Cost(op); + assert(opInfo != nullptr); + auto &hlcOp = hlcOps[i]; + if ( opInfo->cascade == 0 ) + { + // Single operation, not in cascade + GenerateCommands(op, hlcOp, cmds); + } + else + { + // Cascaded operation: generate commands for all operations in the cascade + auto cascadeInfo = _schedule->Cascade(opInfo->cascade); + assert(cascadeInfo != nullptr); + assert(op->Index() == cascadeInfo->start); + // Note: below code assumes: + // - all operations in a cascade are in the same NPU op + // - operations in a cascade are contiguous + // - operations in the npuOp appear in same order as in the schedule + int cascadeSize = cascadeInfo->end - cascadeInfo->start + 1; + assert(i + cascadeSize <= sz); + vector_span> cascadedOps(npuOps, i, i + cascadeSize); + vector_span> cascadedHlcOps(hlcOps, i, i + cascadeSize); + GenerateCommandsForCascade(cascadedOps, cascadedHlcOps, cascadeInfo, cmds); + i += cascadeSize - 1; + } + } + if ( verbose ) + { + PrintCommandStream(npuOp, hlcOps, cmds); + } + return cmds; +} + +void HLCStreamGenerator::PrintCommandStream(const NPUOperation *npuOp, std::vector> &hlcOps, HLCStream &cmds) +{ + LOG_PRINT("High level NPU operations:\n"); + int opIndex = 0; + for ( auto &schedOp : npuOp->Operations() ) + { + auto op = schedOp.get(); + const auto hlcOp = hlcOps[opIndex].get(); + LOG_PRINT("{} {}\n", opIndex, hlcOp->ToString()); + LOG_PRINT(" IFM: {}, {}\n", op->IFM(0)->tensor->Name(), 
hlcOp->ifm[0].ToString()); + if ( hlcOp->ifm.size() > 1 ) + { + LOG_PRINT(" IFM2: {}, {}\n", op->IFM(1)->tensor->Name(), hlcOp->ifm[1].ToString()); + } + if ( hlcOp->ifm.size() > 2 ) + { + LOG_PRINT(" IFM3: {}, {}\n", op->IFM(2)->tensor->Name(), hlcOp->ifm[2].ToString()); + } + LOG_PRINT(" OFM: {}, {}\n", op->OFM()->tensor->Name(), hlcOp->ofm.ToString()); + if ( hlcOp->weights != nullptr ) + { + LOG_PRINT(" Weights: {}, {}\n", op->Input(TensorUsage::Weights)->tensor->Name(), hlcOp->weights->ToString()); + } + ++opIndex; + } + LOG_PRINT("High level command stream:\n"); + for ( unsigned i = 0; i < cmds.size(); ++i ) + { + LOG_PRINT("{} {}\n", i, cmds[i]->ToString()); + } +} + +} // namespace regor diff --git a/ethosu/regor/compiler/high_level_command_stream_generator.hpp b/ethosu/regor/compiler/high_level_command_stream_generator.hpp new file mode 100644 index 00000000..e3e127ab --- /dev/null +++ b/ethosu/regor/compiler/high_level_command_stream_generator.hpp @@ -0,0 +1,63 @@ +// +// SPDX-FileCopyrightText: Copyright 2021, 2023-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#pragma once + +#include "cascade_builder.hpp" +#include "common/vector_span.hpp" +#include "high_level_command_stream.hpp" +#include "scheduler.hpp" +#include "scheduler_operation.hpp" + +#include +#include +#include + +namespace regor +{ + +using HLCStream = std::vector>; + +/// +/// High level command stream generator +/// +class HLCStreamGenerator +{ +public: + // Generates high level command stream for the scheduled operations in the given NPU op + HLCStream GenerateCommandStream(const NPUOperation *npuOp, const Schedule *schedule, bool verbose); + +private: + // Generates one or more HLCStripe commands from a given operation and adds them to the stream + void GenerateHLCStripeCommands(SchedulerOperation *op, const std::shared_ptr &hlcOp, HLCStream &cmds); + // Generates one or more HLCDMA commands from a given operation and adds them to the stream + void GenerateHLCDMACommands(SchedulerOperation *op, const std::shared_ptr &hlcOp, HLCStream &cmds); + // Generates high level commands for the given operation and adds them to the command stream + void GenerateCommands(SchedulerOperation *op, const std::shared_ptr &hlcOp, HLCStream &cmds); + // Generates high level commands for all operations in the cascade and adds them to the command stream + void GenerateCommandsForCascade(vector_span> cascadedOps, + vector_span> hlcOps, const CascadeInfo *cascadeInfo, HLCStream &cmds); + void PrintCommandStream(const NPUOperation *npuOp, std::vector> &hlcOps, HLCStream &cmds); + + // Tracking what has been put in the weight buffers + std::unordered_map> _filledWeightBuffers; + + const Schedule *_schedule = nullptr; +}; + +} // namespace regor diff --git a/ethosu/regor/compiler/hillclimb_allocator.cpp b/ethosu/regor/compiler/hillclimb_allocator.cpp new file mode 100644 index 00000000..d69671d0 --- /dev/null +++ b/ethosu/regor/compiler/hillclimb_allocator.cpp @@ -0,0 +1,355 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// 
SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "hillclimb_allocator.hpp" + +#include "common/numeric_util.hpp" + +#include +#include +#include +#include + +namespace regor +{ + +constexpr Address MAX_ADDRESS = std::numeric_limits
::max(); +constexpr Address NOT_ALLOCATED = -1; + +void HillClimbAllocator::SetLiveRanges(const std::vector> &liveRanges, int alignment) +{ + int maxEndTime = 0; + int id = 0; + for ( const auto &lr : liveRanges ) + { + HillClimbLiveRange hlr = {}; + + hlr.startTime = lr->startTime; + hlr.endTime = lr->endTime; + hlr.size = RoundAway(lr->size, alignment); + hlr.id = id; + maxEndTime = std::max(maxEndTime, lr->endTime); + lrs.push_back(hlr); + ++id; + } + lrsAtTime.resize(maxEndTime + 1); + sizeAtTime.resize(maxEndTime + 1); + neighbours.resize(lrs.size()); + // Calculate which live ranges are active at every timestamp + for ( int t = 0; t <= maxEndTime; ++t ) + { + lrsAtTime[t].clear(); + } + for ( auto &lr : lrs ) + { + for ( auto t = lr.startTime; t <= lr.endTime; ++t ) + { + lrsAtTime[t].push_back(&lr); + } + } + minRequiredSize = 0; + for ( int t = 0; t <= maxEndTime; ++t ) + { + // Calculate minimum needed size at each timestamp + Address neededSize = 0; + for ( auto &lr : lrsAtTime[t] ) + { + neededSize += lr->size; + } + sizeAtTime[t] = neededSize; + minRequiredSize = std::max(neededSize, minRequiredSize); + // Calculate all neighbours + for ( unsigned i = 0; i < lrsAtTime[t].size(); ++i ) + { + auto lr1 = lrsAtTime[t][i]; + auto &nb1 = neighbours[lr1->id]; + for ( auto j = i + 1; j < lrsAtTime[t].size(); ++j ) + { + auto lr2 = lrsAtTime[t][j]; + if ( find(nb1.begin(), nb1.end(), lr2) == nb1.end() ) + { + nb1.push_back(lr2); + neighbours[lr2->id].push_back(lr1); + } + } + } + } + targetSize = minRequiredSize; + // Calculate the urgency of each live range + lrUrgency.resize(lrs.size()); + for ( unsigned i = 0; i < lrs.size(); ++i ) + { + auto &lr = lrs[i]; + Address urgency = 0; + for ( auto t = lr.startTime; t <= lr.endTime; ++t ) + { + urgency = std::max(sizeAtTime[t], urgency); + } + lrUrgency[i] = urgency; + } +} + +Address HillClimbAllocator::Allocate(const std::vector> &liveRanges, int alignment, Address sizeLimit) +{ + SetLiveRanges(liveRanges, 
alignment); + maxAllowedSize = sizeLimit; + iterations = 0; + std::vector indices; + int sz = int(liveRanges.size()); + // Initial solution, using a heuristic allocator + for ( int i = 0; i < sz; ++i ) + { + indices.push_back(i); + } + SortIndicesOnPrio(indices); + // Allocate the initial solution + bestSize = MAX_ADDRESS; + bestSize = AllocateIndices(indices); + if ( bestSize <= targetSize ) + { + // The heuristic allocation returned an optimal solution. + // No need to search. + } + else + { + // Try to improve the heuristic allocation + Search(indices, MAX_ITERATIONS); + } + // Allocate addresses + for ( int i = 0; i < sz; ++i ) + { + liveRanges[i]->SetAddress(lrs[i].address); + } + return bestSize; +} + +void HillClimbAllocator::AllocateLr(HillClimbLiveRange &lr) const +{ + Address address = 0; + int predecessor = NO_PREDECESSOR; + bool fits = false; + while ( !fits ) + { + fits = true; + // Find neighbours that overlap with address + for ( auto lr2_p : neighbours[lr.id] ) + { + if ( lr2_p->address == NOT_ALLOCATED || lr2_p->endAddress <= address ) + { + continue; + } + if ( lr2_p->Overlaps(address, lr.size) ) + { + // Overlap found; increase address + fits = false; + address = lr2_p->endAddress; + predecessor = lr2_p->id; + } + } + } + lr.address = address; + lr.endAddress = address + lr.size; + lr.predecessor = predecessor; +} + +Address HillClimbAllocator::AllocateIndices(const std::vector &indices) +{ + ++iterations; + int sz = int(indices.size()); + std::vector count(sz); + for ( auto &lr : lrs ) + { + lr.address = NOT_ALLOCATED; + } + Address size = 0; + for ( int turn = 0; size <= bestSize && turn < sz; ++turn ) + { + auto &lr = lrs[indices[turn]]; + AllocateLr(lr); + lr.turn = turn; + size = std::max(size, lr.endAddress); + } + return size; +} + +void HillClimbAllocator::SortIndicesOnPrio(std::vector &indices) const +{ + std::sort(indices.begin(), indices.end(), + [this](int const &a, int const &b) + { + // urgent first + if ( lrUrgency[a] != 
lrUrgency[b] ) + { + return lrUrgency[a] > lrUrgency[b]; + } + auto &lr1 = lrs[a]; + auto &lr2 = lrs[b]; + // long duration before short duration + auto duration1 = lr1.endTime - lr1.startTime; + auto duration2 = lr2.endTime - lr2.startTime; + if ( duration1 != duration2 ) + { + return duration1 > duration2; + } + if ( lr1.startTime != lr2.startTime ) + { + return lr1.startTime < lr2.startTime; + } + if ( lr1.size != lr2.size ) + { + return lr1.size > lr2.size; + } + return lr1.id < lr2.id; + }); +} + +void HillClimbAllocator::AddPredecessorTurns(std::set &turns, const HillClimbLiveRange &lr) const +{ + turns.insert(lr.turn); + int id = lr.id; + while ( lrs[id].predecessor != NO_PREDECESSOR ) + { + id = lrs[id].predecessor; + turns.insert(lrs[id].turn); + } +} + +void HillClimbAllocator::AttemptBottleneckFix(std::vector &indices, int iterationsStuck) +{ + // Find the bottleneck + HillClimbLiveRange *maxLr = &lrs[0]; + for ( auto &lr : lrs ) + { + if ( lr.endAddress > maxLr->endAddress ) + { + maxLr = &lr; + } + } + // Find all live ranges that affected the placement of the bottleneck live range. + // This consists of two types of live ranges: + // - direct neighbours of the bottleneck live range + // - direct and indirect predecessors of these neighbours + bottleneck + // The turns at which these live ranges were allocated are put in the turns vector. + std::set turns; + AddPredecessorTurns(turns, *maxLr); + for ( auto lr_p : neighbours[maxLr->id] ) + { + AddPredecessorTurns(turns, *lr_p); + } + // Non-direct neighbours that interfere with the allocation of the bottleneck are the + // immediate cause for gaps in the allocation, and are selected with higher probability. 
+ std::vector turnList; + std::vector nonNbTurnList; + for ( auto turn : turns ) + { + turnList.push_back(turn); + auto &lr = lrs[indices[turn]]; + if ( !maxLr->IsNeighbour(lr) ) + { + nonNbTurnList.push_back(turn); + } + } + // Pick from non-neighbour list with 30% probability (magic number based on tuning) + int ix1; + if ( rng() % 100 < 30 && !nonNbTurnList.empty() ) + { + // Pick a live range from the "non-neighbour list" + ix1 = nonNbTurnList[rng() % nonNbTurnList.size()]; + } + else + { + // Pick any affecting live range. + ix1 = turnList[rng() % turnList.size()]; + } + // Note: turnList has always at least 2 elements for bottlenecks + int ix2 = turnList[rng() % (turnList.size() - 1)]; + if ( ix1 == ix2 ) + { + ix2 = turnList[turnList.size() - 1]; + } + // Swap indices + std::swap(indices[ix1], indices[ix2]); + if ( iterationsStuck > MAX_ITERATIONS_STUCK ) + { + // The best allocation has not improved for a while, maybe improvement is not possible + // by single-swapping indices; add more neighbour live ranges and swap 2 more indices. + // Adding more neighbours can sometimes resolve the situation where the current bottleneck + // is resolved, but always results in a higher bottleneck at a nearby live range. 
+ // Magic number is based on tuning + for ( auto turn : nonNbTurnList ) + { + for ( auto lr_p : neighbours[indices[turn]] ) + { + if ( turns.count(lr_p->turn) == 0 ) + { + turns.insert(lr_p->turn); + turnList.push_back(lr_p->turn); + } + } + } + ix1 = turnList[rng() % turnList.size()]; + ix2 = turnList[rng() % turnList.size()]; + std::swap(indices[ix1], indices[ix2]); + } +} + +void HillClimbAllocator::Search(std::vector &indices, int iters) +{ + std::vector bestIndices = indices; + std::vector bestLrs = lrs; + int lastImprovementIteration = 0; + + for ( int i = 0; bestSize > maxAllowedSize && i < iters && i - lastImprovementIteration < MIN_ITERATIONS_IMPROVE; ++i ) + { + // Reorder the indices + AttemptBottleneckFix(indices, i - lastImprovementIteration); + // Allocate the reordered indices and check if it gave an improvement + auto newSize = AllocateIndices(indices); + if ( newSize <= bestSize ) + { + // The new allocation produced a new best result; remember it + if ( newSize < bestSize ) + { + lastImprovementIteration = i; + } + bestSize = newSize; + bestIndices = indices; + bestLrs = lrs; + if ( bestSize <= targetSize ) + { + // Target reached; stop + return; + } + } + else + { + // The new allocation produced worse result; undo the change + indices = bestIndices; + lrs = bestLrs; + } + } + lrs = std::move(bestLrs); +} + +Address HillClimbAllocateLiveRanges(LiveRangeGraph &lrGraph, int alignment, Address sizeLimit) +{ + HillClimbAllocator allocator; + return allocator.Allocate(lrGraph.LiveRanges(), alignment, sizeLimit); +} + +} // namespace regor diff --git a/ethosu/regor/compiler/hillclimb_allocator.hpp b/ethosu/regor/compiler/hillclimb_allocator.hpp new file mode 100644 index 00000000..bfbfb343 --- /dev/null +++ b/ethosu/regor/compiler/hillclimb_allocator.hpp @@ -0,0 +1,172 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 
2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "architecture/architecture.hpp" +#include "live_range.hpp" + +#include +#include +#include +#include +#include + +namespace regor +{ + +struct HillClimbLiveRange +{ + // Start time, input to the allocator + int startTime; + // End time (inclusive), input to the allocator + int endTime; + // Size, input to the allocator + int size; + // Allocated address, output of the allocator + Address address; + // End address, exclusive + Address endAddress; + // Index of this live range + int id; + // id of predecessor live range (predecessor's end address == this lr's address) + int predecessor; + // Turn at which the live range was allocated + int turn; + + bool Overlaps(Address addr2, Address size2) const { return address < addr2 + size2 && addr2 < endAddress; } + bool IsNeighbour(const HillClimbLiveRange &lr) const { return startTime <= lr.endTime && lr.startTime <= endTime; } +}; + +// Implementation of a tensor allocator using state space exploration. 
+// +// The basic algorithm is: +// +// - Use a heuristic allocator to find an initial allocation +// - while allocation is not optimal and iterations < MAX_ITERATIONS: +// - find the "bottleneck": the live range with highest end address +// - find all live ranges that affected the allocation of the bottleneck +// - swap the order of any two affecting live ranges +// - reallocate tensors using the reordered live ranges +// - if the new allocation is better: keep it, else set allocation to previous allocation +class HillClimbAllocator +{ +private: + static constexpr int MAX_ITERATIONS = 99999; + // Special handling if best solution has not improved during this many iterations + static constexpr int MAX_ITERATIONS_STUCK = 50; + // Minimum number of iterations since the last improvement (unless an optimal solution is found) + static constexpr int MIN_ITERATIONS_IMPROVE = 5000; + // Used for live ranges allocated at address 0 + static constexpr int NO_PREDECESSOR = -1; + // Contains the live ranges + std::vector lrs; + // Contains active live ranges at each timestamp + std::vector> lrsAtTime; + // + // Contains neighbours of each live range (indexed by lr.id), i.e. + // live ranges with overlapping start/end time. + std::vector> neighbours; + // + // At each timestamp: accumulated size of active live ranges + std::vector
sizeAtTime; + // + // For each live range: max value of sizeAtTime (only used in the heuristic allocation) + std::vector
lrUrgency; + // + // The maximum allowed size (the size of the physical available memory) + Address maxAllowedSize = 0; + // The minimum possible size, assuming all live ranges can be perfectly allocated + Address minRequiredSize = 0; + // The algorithm stops once the target size has been achieved + Address targetSize = 0; + // The highest end address of the best found allocation + Address bestSize = 0; + // Number of performed iterations + int iterations = 0; + // Random number generator; use default seed (which is well-defined) + std::mt19937 rng; + +public: + // Runs the allocation algorithm and updates the address field of lrs. + // Finishes when the target size has been reached or when maximum iterations have been run. + // + // Implementation note: the algorithm produces reproducible results by using + // a well-defined random number generator with well-defined default seed, + // and using a fixed number of iterations. + Address Allocate(const std::vector> &lrs, int alignment, Address sizeLimit); + + Address MinimumRequiredSize() const { return minRequiredSize; } + int Iterations() const { return iterations; } + +private: + void SetLiveRanges(const std::vector> &liveRanges, int alignment); + + // Allocates the given live range at the smallest possible address + void AllocateLr(HillClimbLiveRange &lr) const; + // + // Allocates the live ranges in the order indicated by the indices; + // allocates each live range at the lowest possible address. + + Address AllocateIndices(const std::vector &indices); + + // Sorts live ranges based on heuristics, used for the initial allocation + void SortIndicesOnPrio(std::vector &indices) const; + + // Adds the given live range + predecessors to the turns vector + void AddPredecessorTurns(std::set &turns, const HillClimbLiveRange &lr) const; + + // Finds the "bottleneck", the live range with highest end address, and reorders the indices + // such that a next allocation might lower the memory usage. 
+ // + // --------- + // | | + // | D | + // | | + // ---------------------------------- + // | B | + // ------------------------------- + // | | + // |A| --- + // | | |C| + // | | | | + // --------------------------------------- + // + // In the above example, the allocation order was [A, B, C, D] and D is the resulting bottle-neck. + // The live ranges that affected the allocation of D are the direct neighbours of D (i.e. B and C), + // and all direct and indirect predecessors of D and its neighbours + // (i.e. A, which is the predecessor of B, and indirect predecessor of D). + // + // By permuting the order in which the affecting live ranges are allocated, the bottleneck might + // be lowered. In the above example, almost any permutation would lower the bottleneck. + // + // Note that there is room to improve the efficiency of the algorithm. + // One way could be to first allocate all direct neighbours of the bottleneck + // (i.e. B, C, D) and then the other affecting live ranges (i.e. A). The algorithm currently does + // not actively try this, as it may lead to allocation loops (A could become the new bottle-neck); + // it just uses a higher probability of selecting A. + void AttemptBottleneckFix(std::vector &indices, int iterationsStuck); + + // Search for a solution, using the given indices as initial solution. 
+ void Search(std::vector &indices, int iterations); +}; + +// Wrapper function to perform live range allocation +Address HillClimbAllocateLiveRanges(LiveRangeGraph &lrGraph, int alignment, Address sizeLimit); + +} // namespace regor diff --git a/ethosu/regor/compiler/kernel.hpp b/ethosu/regor/compiler/kernel.hpp new file mode 100644 index 00000000..7bea642e --- /dev/null +++ b/ethosu/regor/compiler/kernel.hpp @@ -0,0 +1,161 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#pragma once + +#include "common/common.hpp" + +#include "common/numeric_util.hpp" +#include "include/graphapi.hpp" + +#include +#include + +namespace regor +{ + +class Margin +{ +private: + int _top = 0; + int _left = 0; + int _bottom = 0; + int _right = 0; + int _near = 0; + int _far = 0; + +public: + Margin(int top, int left, int bottom, int right) : _top(top), _left(left), _bottom(bottom), _right(right) {} + Margin(int top, int left, int bottom, int right, int near, int far) : + _top(top), _left(left), _bottom(bottom), _right(right), _near(near), _far(far) + { + } + + Margin() = default; + + int Top() const { return _top; } + int Left() const { return _left; } + int Bottom() const { return _bottom; } + int Right() const { return _right; } + int Near() const { return _near; } + int Far() const { return _far; } + + bool IsZero() const { return !(_top | _left | _bottom | _right | _near | _far); } + + std::string ToString() const + { + return fmt::format("[t:{},l:{},b:{},r:{},n:{},f:{}]", _top, _left, _bottom, _right, _near, _far); + } +}; + +/// +/// Kernel parameters +/// +class Kernel +{ +private: + Point2i _size; + Point2i _stride; + Point2i _dilation; + int _sizeZ = 0; + int _strideZ = 0; + int _dilationZ = 0; + Margin _padding; + int _depthMultiplier = 0; + + +public: + Kernel(const GraphApi::GraphKernel *kernel) + { + _size = Point2i(kernel->sizeYXZ[1], kernel->sizeYXZ[0]); + _sizeZ = kernel->sizeYXZ[2]; + _stride = Point2i(kernel->strideYXZ[1], kernel->strideYXZ[0]); + _strideZ = kernel->strideYXZ[2]; + _dilation = Point2i(kernel->dilationYXZ[1], kernel->dilationYXZ[0]); + _dilationZ = kernel->dilationYXZ[2]; + _padding = Margin(kernel->paddingTBLRNF[0], kernel->paddingTBLRNF[2], kernel->paddingTBLRNF[1], + kernel->paddingTBLRNF[3], kernel->paddingTBLRNF[4], kernel->paddingTBLRNF[5]); + _depthMultiplier = 0; + assert(_size.x > 0 && _size.y > 0); + } + + Kernel(Point2i size, Point2i stride, Point2i dilation, int depthMultiplier = 1, Margin padding = 
Margin(0, 0, 0, 0)) + { + assert(size.x > 0 && size.y > 0); + assert(stride.x > 0 && stride.y > 0); + _size = size; + _stride = stride; + _dilation = dilation; + _depthMultiplier = depthMultiplier; + _padding = padding; + } + Kernel() = default; + + int ElementsWH() const { return _size.x * _size.y; } + const Point3 Size3D() const { return {_size.x, _size.y, _sizeZ}; } + const Point3 Stride3D() const { return {_stride.x, _stride.y, _strideZ}; } + const Point3 Dilation3D() const { return {_dilation.x, _dilation.y, _dilationZ}; } + const Point2i &Size() const { return _size; } + const Point2i &Stride() const { return _stride; } + const Point2i &Dilation() const { return _dilation; } + int DepthMultiplier() const { return _depthMultiplier; } + const Margin &Padding() const { return _padding; } + + Kernel WithSize(Point2i size) const + { + Kernel tmp(*this); + tmp._size = size; + return tmp; + } + + Kernel WithStride(Point2i stride) const + { + Kernel tmp(*this); + tmp._stride = stride; + return tmp; + } + + Kernel WithDilation(Point2i dilation) const + { + Kernel tmp(*this); + tmp._dilation = dilation; + return tmp; + } + + Kernel WithPadding(Margin padding) const + { + Kernel tmp(*this); + tmp._padding = padding; + return tmp; + } + + Point2i DilatedWH() const { return (_dilation * (_size - Point2i(1, 1))) + Point2i(1, 1); } + + std::string ToString() const + { + return fmt::format("size={},{} stride={},{}, dilation={},{} padding={}", _size.x, _size.y, _stride.x, _stride.y, + _dilation.x, _dilation.y, _padding.ToString()); + } +}; + +static inline int RequiredInputSize(int value, int stride, int border, int upscale, int rounding = 0) +{ + return int(std::ceil(float((value - 1) * stride + border + rounding) / float(upscale))); +} + +} // namespace regor diff --git a/ethosu/regor/compiler/live_range.cpp b/ethosu/regor/compiler/live_range.cpp new file mode 100644 index 00000000..9671045c --- /dev/null +++ b/ethosu/regor/compiler/live_range.cpp @@ -0,0 +1,233 @@ +// +// 
SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + + +#include "live_range.hpp" + +#include "architecture/architecture.hpp" +#include "scheduler.hpp" +#include "scheduler_operation.hpp" +#include "tensor.hpp" + +#include +#include +#include +#include +#include +#include + +namespace regor +{ + +std::vector LiveRangeGraph::GetTemporalMemoryUsage(int &maxUsage) +{ + std::vector usage(_currentTime + 1); + for ( const auto &lr : _lrs ) + { + assert(lr->endTime <= _currentTime); + for ( int i = lr->startTime; i <= lr->endTime; ++i ) + { + usage[i] += lr->size; + } + } + maxUsage = *std::max_element(usage.begin(), usage.end()); + return usage; +} + +void LiveRangeGraph::ExtractLiveRangesFromCascades(const std::vector> &schedOps, + Schedule *schedule, const MemArea &targetMemory, bool addRollingBuffers) +{ + std::unordered_map timeForCascade; + auto startTime = _currentTime; + // Live ranges containing graph output + std::vector graphOutputRanges; + for ( const auto &schedOp : schedOps ) + { + SchedulerOpInfo *opInfo = schedule->Cost(schedOp.get()); + int cascade = opInfo->cascade; + + CascadeInfo *cascadeInfo = cascade == 0 ? 
nullptr : &schedule->cascades[cascade]; + CascadeBuffer *cascadeBuffer = nullptr; + + if ( cascadeInfo == nullptr ) + { + // Check if op have an ifm tensor that can be reused for the ofm + auto ifmTens = ReusableIFM(schedOp, targetMemory); + if ( ifmTens != nullptr ) + { + // ifm can be reused + FuseRanges(ifmTens, schedOp->OFM()->tensor.get()); + } + } + + int timeToSet = _currentTime; + if ( cascadeInfo != nullptr ) + { + auto entry = cascadeInfo->buffers.find(*schedOp); + if ( entry != cascadeInfo->buffers.end() ) + { + cascadeBuffer = &entry->second; + } + auto tfcEntry = timeForCascade.find(cascade); + if ( tfcEntry != timeForCascade.end() ) + { + timeToSet = tfcEntry->second; + } + } + opInfo->timeIndex = timeToSet; + + // Mark usage for all relevant tensors related to this operation + for ( auto &liveTensor : schedOp->LiveRangeTensors() ) + { + auto usage = liveTensor.first; + auto tens = liveTensor.second; + bool isRollingBuffer = cascadeBuffer != nullptr && usage == MakeTensorUsage(TensorUsage::IFM, schedOp->PrimaryIfmIndex()); + if ( ShouldBeIgnored(tens, targetMemory) && !(addRollingBuffers && isRollingBuffer) ) + { + continue; + } + auto lr = GetOrCreateRange(tens); + if ( tens->isGraphInput ) + { + // Graph input must not be overwritten by preceding schedOps + lr->MarkUsage(startTime); + } + if ( tens->isGraphOutput ) + { + // Graph output must not be overwritten by following schedOps + graphOutputRanges.push_back(lr); + } + lr->MarkUsage(timeToSet); + if ( isRollingBuffer ) + { + // This tensor is a rolling buffer in a cascade and the size of the LiveRange needs to be modified + // for enabling temporal memory snapshots without modifying the original Tensor + lr->size = cascadeBuffer->sizeBytes; + } + } + // Buffered weight tensor + auto weightTens = opInfo->bufferedWeightTensor.tensor.get(); + if ( !ShouldBeIgnored(weightTens, targetMemory) ) + { + auto lr = GetOrCreateRange(weightTens); + if ( opInfo->bufferedWeightTensor.preBuffer ) + { + 
lr->MarkUsage(timeToSet - 1, 2); + } + else + { + lr->MarkUsage(timeToSet); + } + } + // Read-only weight/scale tensors + for ( auto tens : {opInfo->npuWeightsTensor, opInfo->npuScalesTensor} ) + { + if ( !ShouldBeIgnored(tens.get(), targetMemory) ) + { + auto lr = GetOrCreateRange(tens.get()); + lr->MarkUsage(timeToSet); + } + } + if ( timeToSet == _currentTime ) + { + _currentTime += 2; + } + if ( cascade != 0 ) + { + timeForCascade[cascade] = timeToSet; + } + } + for ( auto lr : graphOutputRanges ) + { + lr->MarkUsage(_currentTime, 1); + ++_currentTime; + } +} + +LiveRange *LiveRangeGraph::GetOrCreateRange(SchedulerTensor *tens) +{ + // Return the live range of the tensor (or any of its clones) + const auto entry = _equivalenceIdToLr.find(tens->equivalenceId); + if ( entry != _equivalenceIdToLr.end() ) + { + entry->second->AddTensor(tens); + return entry->second; + } + // No live range found for the tensor, create a new one + auto lr = std::make_shared(tens); + _lrs.push_back(lr); + _equivalenceIdToLr[tens->equivalenceId] = lr.get(); + return lr.get(); +} + +LiveRange *LiveRangeGraph::FuseRanges(SchedulerTensor *inTens, SchedulerTensor *outTens) +{ + assert(outTens->AllocationSizeBytes() <= inTens->AllocationSizeBytes()); + auto lr = GetOrCreateRange(inTens); + lr->AddTensor(outTens); + const auto entry = _equivalenceIdToLr.find(outTens->equivalenceId); + if ( entry != _equivalenceIdToLr.end() ) + { + // Live range already existed for outTens, move over tensors + auto &lr2 = entry->second; + lr->tensors.insert(lr2->tensors.begin(), lr2->tensors.end()); + lr2->tensors.clear(); + lr2->size = 0; + } + _equivalenceIdToLr[outTens->equivalenceId] = lr; + return lr; +} + +SchedulerTensor *LiveRangeGraph::ReusableIFM(const std::unique_ptr &schedOp, const MemArea &targetMemory) +{ + SchedulerTensor *reusableIfm = nullptr; + if ( IsElementwise(schedOp->Type()) ) + { + // Check if possible to merge ifm/ofm live ranges of elementwise op + const auto ofmConn = 
schedOp->OFM(); + const auto ofmTens = ofmConn->tensor.get(); + + if ( !ShouldBeIgnored(ofmTens, targetMemory) ) + { + for ( const auto &[usage, ifmConn] : schedOp->inputs.pairs() ) + { + const auto ifmTens = ifmConn.tensor.get(); + + if ( IsIFM(usage) && !ifmTens->isGraphOutput && ifmConn.shape == ofmConn->shape && + ifmTens->format == ofmTens->format && ifmTens->dataType == ofmTens->dataType && + !ShouldBeIgnored(ifmTens, targetMemory) && ifmTens->consumers.size() == 1 && ofmTens->producers.size() == 1 ) + { + reusableIfm = ifmTens; + break; + } + } + } + } + return reusableIfm; +} + +bool LiveRangeGraph::ShouldBeIgnored(SchedulerTensor *tens, const MemArea &targetMemory) +{ + if ( tens == nullptr ) + { + return true; + } + return tens->memArea != targetMemory; +} + +} // namespace regor diff --git a/ethosu/regor/compiler/live_range.hpp b/ethosu/regor/compiler/live_range.hpp new file mode 100644 index 00000000..2a22a6fc --- /dev/null +++ b/ethosu/regor/compiler/live_range.hpp @@ -0,0 +1,112 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#pragma once + +#include "architecture/architecture.hpp" +#include "scheduler.hpp" +#include "scheduler_operation.hpp" +#include "tensor.hpp" + +#include +#include +#include +#include +#include +#include + +namespace regor +{ + +/// +/// Live range +/// +struct LiveRange +{ + // Tensors with equivalence ids that are assigned to the same LiveRange will be allocated to the same address + std::unordered_set tensors; + // Time at which the live range's tensors start being used. + int startTime = std::numeric_limits::max(); + // Note: the end time is inclusive + int endTime = -1; + int size = 0; + MemArea memArea; + std::string name = ""; + + LiveRange(SchedulerTensor *tensor) + { + size = tensor->AllocationSizeBytes(); + memArea = tensor->memArea; + name = tensor->Name(); + AddTensor(tensor); + } + + void AddTensor(SchedulerTensor *tensor) { tensors.insert(tensor); } + + void MarkUsage(int opTime, int opDuration = 1) + { + assert(opDuration >= 0); + int opTimeStart = std::max(opTime, 0); + int opTimeEnd = opTime + opDuration; + if ( opTimeEnd > opTimeStart ) + { + startTime = std::min(startTime, opTimeStart); + endTime = std::max(endTime, opTimeEnd); + } + } + + void SetAddress(Address address) + { + for ( auto &tensor : tensors ) + { + tensor->allocatedAddress = address; + } + } + + std::string ToString() const + { + return fmt::format("", name, startTime, endTime, size); + } +}; + +class LiveRangeGraph +{ +private: + /** All allocated live ranges */ + std::vector> _lrs; + /** Map from equivalence id -> live range */ + std::unordered_map _equivalenceIdToLr; + int _currentTime = 0; + +public: + virtual ~LiveRangeGraph() = default; + int EndTime() const { return _currentTime + 1; } + + std::vector> LiveRanges() const { return _lrs; }; + + /** usage[t] will be set to the memory usage at time t, for each timestamp t in the live graph */ + std::vector GetTemporalMemoryUsage(int &maxUsage); + void ExtractLiveRangesFromCascades(const std::vector> &schedOps, + Schedule 
*schedule, const MemArea &targetMemory, bool addRollingBuffers); + LiveRange *GetOrCreateRange(SchedulerTensor *tens); + LiveRange *FuseRanges(SchedulerTensor *inTens, SchedulerTensor *outTens); + SchedulerTensor *ReusableIFM(const std::unique_ptr &schedOp, const MemArea &targetMemory); + virtual bool ShouldBeIgnored(SchedulerTensor *tens, const MemArea &targetMemory); +}; + +} // namespace regor diff --git a/ethosu/regor/compiler/network_performance.cpp b/ethosu/regor/compiler/network_performance.cpp new file mode 100644 index 00000000..c47868c5 --- /dev/null +++ b/ethosu/regor/compiler/network_performance.cpp @@ -0,0 +1,317 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#include "network_performance.hpp" + +#include "common/common.hpp" + +#include "database.hpp" +#include "graph_optimiser.hpp" + +#include + +BEGIN_ENUM_TABLE(regor::AccessType) + ADD_ENUM_NAME(Lut) + ADD_ENUM_NAME(FeatureMap) + ADD_ENUM_NAME(Weights) + ADD_ENUM_NAME(Scales) +END_ENUM_TABLE() + +namespace regor +{ +NetworkPerformance::NetworkPerformance(Architecture *arch, const std::vector> &ops) : + _arch(arch), _ops(ops) +{ + assert(arch); +} + +PerformanceResult NetworkPerformance::Measure(Schedule *schedule, OptimiserDatabase *optDb) +{ + SchedulerOperation *prevOp = nullptr; + SchedulerOpInfo *prevCost = nullptr; + PerformanceResult performance; + Database *db = nullptr; + std::unordered_set memories({_arch->ReadonlyMemory().memory, _arch->FeatureMapMemory().memory, + _arch->LUTMemory().memory, _arch->StagingMemory().memory}); + std::unordered_set regions( + {_arch->ReadonlyMemory(), _arch->FeatureMapMemory(), _arch->LUTMemory(), _arch->StagingMemory()}); + int opTable = 0; + int opTableColumnCount = 0; + std::unordered_set tensorUids; + + if ( optDb ) + { + db = optDb->Get(); + opTable = db->AddTable("perf"); + std::vector columns = { + "source_id", + "optimised_id", + "operator", + "name", + "staging_usage", + "op_cycles", + "npu_cycles", + "mac_count", + }; + for ( const auto &mem : memories ) + { + std::string label = mem->Name() + "_ac"; + columns.push_back(label); + } + db->AddColumns(opTable, columns); + opTableColumnCount = int(columns.size()); + } + + for ( auto const &schedOp : _ops ) + { + SchedulerOpInfo *cost = schedule->Cost(schedOp.get()); + PerformanceResult perf = {}; + if ( schedOp->IsNpuOp() ) + { + perf = EstimateFullOpPerformance(schedOp.get(), cost, prevOp, prevCost); + perf.npuOps = 1; + perf.memory[_arch->StagingMemory().memory].peakUsage = schedule->MemoryUsageAt(cost->timeIndex); + + // Calculate total original and encoded weights + // Weight statistics is not set on a per-operation level as some operations share weight tensors 
+ SchedulerConnection *weightConn = schedOp->TryInput(TensorUsage::Weights); + if ( weightConn && cost->npuWeightsTensor ) + { + // check if the weight tensor has already been accounted for in total weights + auto pos = tensorUids.find(weightConn->tensor->uid); + if ( pos == std::end(tensorUids) ) + { + tensorUids.insert(weightConn->tensor->uid); + performance.originalWeights += weightConn->tensor->AllocationSizeBytes(); + performance.encodedWeights += cost->npuWeightsTensor->totalWeightBytes; + } + } + } + else + { + perf.cpuCycles = 1; // TODO: model CPU cycle counts + perf.cpuOps = 1; + } + // Insert any missing memories + for ( ArchitectureMemory *a : memories ) + { + perf.memory.emplace(a, PerformanceResult::MemoryAccesses{}); + } + + if ( optDb != nullptr ) + { + AddToDatabase(perf, schedOp, opTable, opTableColumnCount, memories, optDb); + } + + performance += perf; + prevOp = schedOp.get(); + prevCost = cost; + } + // TODO: Remove this line and separate memory allocation from usage. + performance.memory[_arch->StagingMemory().memory].peakUsage = 0; + + for ( auto ®ion : regions ) + { + // RHS is not peak usage, but peak allocation. 
+ performance.memory[region.memory].peakUsage += schedule->memoryUsage[region]; + } + + performance.cascades = schedule->cascades.size(); + + return performance; +} + +void NetworkPerformance::AddToDatabase(const PerformanceResult &perf, const std::unique_ptr &schedOp, + int opTable, int /*opTableColumnCount*/, const std::unordered_set &memories, OptimiserDatabase *optDb) +{ + // Per-layer calculations + assert(optDb != nullptr); + std::vector row; + std::string opName = "N/A"; + Database *db = optDb->Get(); + + const auto *conn = schedOp->TryOFM(); + if ( conn != nullptr && conn->tensor != nullptr && conn->tensor->srcTensor != nullptr ) + { + opName = conn->tensor->srcTensor->Name(); + } + + int sourceId = optDb->SourceId(schedOp->_srcKey); + int optId = optDb->OptimisedId(schedOp->_srcKey); + row = { + std::to_string(sourceId), + std::to_string(optId), + OpTypeToString(schedOp->Type()), + std::move(opName), + std::to_string(perf.memory.at(_arch->StagingMemory().memory).peakUsage), + std::to_string(perf.totalCycles), + std::to_string(perf.npuCycles), + std::to_string(perf.macCount), + }; + + for ( const auto mem : memories ) + { + row.push_back(std::to_string(perf.memory.at(mem).AccessCycles())); + } + + db->AddRow(opTable, schedOp->Index(), std::move(row)); +} + + +PerformanceResult NetworkPerformance::EstimateFullOpPerformance( + SchedulerOperation *schedOp, SchedulerOpInfo *cost, SchedulerOperation *prevOp, SchedulerOpInfo *prevCost) +{ + UNUSED(prevOp); + PerformanceQuery query = Scheduler::InitPerfQuery(schedOp, cost->Config(), -1); + std::vector fused = Scheduler::InitFusionQuery(schedOp); + + CycleCost cycles = _arch->Performance()->MeasureCycleCost(query, fused); + + PerformanceResult result; + result.npuCycles = cycles.opCycles; + result.macCount = cycles.macs; + + if ( cost->cascade != 0 ) + { + result.cascadedOps = 1; + } + + ElementAccess access = _arch->Performance()->MeasureElementAccess(query); + ElementAccess byteAccess = 
_arch->Performance()->ElementTransferToBytes(query, access); + + // How many NPU cycles are available under the previously executing + // operator for performing buffered DMA transfers + int64_t slackCycles = (prevCost != nullptr) ? prevCost->slackBufferingCycles : 0; + + // LUT transfer stats + auto lut = schedOp->TryInput(TensorUsage::LUT); + int64_t lutTransferCycles = 0; + + if ( lut ) + { + auto srcMemory = lut->tensor->memArea.memory; + auto dstMemory = _arch->LUTMemory().memory; + assert(srcMemory); + + if ( (srcMemory != nullptr) && (dstMemory != srcMemory) ) + { + int copySize = lut->PartialAllocationSizeBytes(); + lutTransferCycles = _arch->Performance()->MemToMemCycles(dstMemory, srcMemory, copySize); + + result.memory[srcMemory].access[AccessType::Lut].bytesRead += copySize; + result.memory[dstMemory].access[AccessType::Lut].bytesWritten += copySize; + } + } + + // Memory that NPU will source weights from for operations + ArchitectureMemory *weightsMemory = cost->npuWeightsTensor ? 
cost->npuWeightsTensor->memArea.memory : nullptr; + + if ( weightsMemory && cost->bufferedWeightTensor.tensor ) + { + // DMA Weight Transfer + int initialSize = 0; + + // Get the size of the first DMA + for ( int streamIndex = 0; streamIndex < cost->npuWeightsTensor->subStreams; streamIndex++ ) + { + auto pos = cost->npuWeightsTensor->encodedRanges.find(streamIndex); + if ( pos != cost->npuWeightsTensor->encodedRanges.end() ) + { + initialSize += pos->second.TotalBytes(); + } + } + + auto srcWeightMem = weightsMemory; + auto dstWeightMem = cost->bufferedWeightTensor.tensor->memArea.memory; + assert(srcWeightMem != dstWeightMem); + + weightsMemory = dstWeightMem; // Update source to use buffered weight memory + + // Calculate initial weight transfer cycles + int64_t weightCycles = _arch->Performance()->MemToMemCycles(dstWeightMem, srcWeightMem, initialSize); + weightCycles = std::max(weightCycles - slackCycles, int64_t(0)); + + int weightsSize = cost->npuWeightsTensor->AllocationSizeBytes(); + result.memory[srcWeightMem].access[AccessType::Weights].bytesRead += weightsSize; + result.memory[dstWeightMem].access[AccessType::Weights].bytesWritten += weightsSize; + + // Add cycles for Weight + Scale Transfer + result.npuCycles = std::max(cost->fullWeightTransferCycles - slackCycles + cost->slackBufferingCycles, cycles.opCycles + weightCycles); + } + else + { + // Calculate non-hidden LUT transfer cycles + lutTransferCycles = std::max(lutTransferCycles - slackCycles, int64_t(0)); + } + + // Add cycles for LUT Transfer + result.npuCycles += lutTransferCycles; + + // OFM write + auto ofm = schedOp->OFM(); + result.memory[ofm->tensor->memArea.memory].access[AccessType::FeatureMap].bytesWritten += byteAccess.ofmWrite; + + // IFM1 read + auto ifm = schedOp->IFM(0); + result.memory[ifm->tensor->memArea.memory].access[AccessType::FeatureMap].bytesRead += byteAccess.ifmRead[0]; + + // IFM2 read + auto ifm2 = schedOp->TryIFM(1); + if ( ifm2 ) + { + 
result.memory[ifm2->tensor->memArea.memory].access[AccessType::FeatureMap].bytesRead += byteAccess.ifmRead[1]; + } + + // Weight read + if ( cost->npuWeightsTensor && access.constRead[0] > 0 ) + { + int encodedWeightsSize = cost->npuWeightsTensor->totalWeightBytes; + result.memory[weightsMemory].access[AccessType::Weights].bytesRead += int64_t(encodedWeightsSize) * access.weightsRefetch; + } + + // Scale read + if ( cost->npuWeightsTensor && access.constRead[1] > 0 ) + { + int encodedScaleSize = cost->npuWeightsTensor->AllocationSizeBytes() - cost->npuWeightsTensor->totalWeightBytes; + result.memory[weightsMemory].access[AccessType::Scales].bytesRead += int64_t(encodedScaleSize) * access.weightsRefetch; + } + + // Update memory-access cycles and find the maximum memory read cycle time + int64_t maxMemCycles = 0; + for ( auto &[mem, stats] : result.memory ) + { + float bandwidth = mem->Bandwidth(); + int64_t memBytes = 0; + for ( auto &[accType, acc] : stats.access ) + { + // compute cycles per accessType + int64_t bytes = acc.bytesRead + acc.bytesWritten; + memBytes += bytes; + int64_t accCycles = int64_t(float(bytes) / bandwidth); + acc.accessCycles = accCycles; + } + // get maximum cycles per memory + int64_t memCycles = int64_t(float(memBytes) / bandwidth); + maxMemCycles = std::max(maxMemCycles, memCycles); + } + + result.totalCycles = std::max(result.npuCycles, maxMemCycles); + return result; +} + +} // namespace regor diff --git a/ethosu/regor/compiler/network_performance.hpp b/ethosu/regor/compiler/network_performance.hpp new file mode 100644 index 00000000..869acf99 --- /dev/null +++ b/ethosu/regor/compiler/network_performance.hpp @@ -0,0 +1,153 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "common/common.hpp" + +#include "compiler/database.hpp" +#include "compiler/graph_optimiser.hpp" +#include "compiler/scheduler.hpp" + +#include + +namespace regor +{ + +/// +/// Performance information for a whole schedule +/// +enum AccessType +{ + Lut = 0, + FeatureMap = 1, + Weights = 2, + Scales = 3, +}; + +struct PerformanceResult +{ + + struct MemoryAccess + { + int64_t bytesRead = 0; + int64_t bytesWritten = 0; + int64_t accessCycles = 0; + + MemoryAccess &operator+=(const MemoryAccess &other) + { + this->bytesRead += other.bytesRead; + this->bytesWritten += other.bytesWritten; + this->accessCycles += other.accessCycles; + return *this; + } + }; + + struct MemoryAccesses + { + std::unordered_map access; + int64_t peakUsage = 0; + int64_t AccessCycles() const + { + int64_t cycles = 0; + for ( const auto &[type, acc] : access ) + { + cycles += acc.accessCycles; + } + return cycles; + } + + MemoryAccesses &operator+=(const MemoryAccesses &other) + { + for ( const auto &[type, acc] : other.access ) + { + access[type] += acc; + } + peakUsage = std::max(peakUsage, other.peakUsage); + return *this; + } + }; + + std::unordered_map memory; + int64_t npuCycles = 0; + int64_t cpuCycles = 0; + int64_t totalCycles = 0; + int64_t macCount = 0; + int64_t cpuOps = 0; + int64_t npuOps = 0; + int64_t cascadedOps = 0; + int64_t cascades = 0; + int64_t originalWeights = 0; + int64_t encodedWeights = 0; + + int Accesses() const + { + int accesses = 0; + for ( const auto &[archMem, stats] : memory ) + { + accesses += 
int(stats.access.size()); + } + return accesses; + } + + PerformanceResult &operator+=(const PerformanceResult &other) + { + // Not ideal for performance + for ( const auto &[arch, memoryStat] : other.memory ) + { + memory[arch] += memoryStat; + } + this->npuCycles += other.npuCycles; + this->cpuCycles += other.cpuCycles; + this->totalCycles += other.totalCycles; + this->macCount += other.macCount; + this->cpuOps += other.cpuOps; + this->npuOps += other.npuOps; + this->cascadedOps += other.cascadedOps; + this->cascades += other.cascades; + this->originalWeights += other.originalWeights; + this->encodedWeights += other.encodedWeights; + return *this; + } +}; + +/// +/// Whole-schedule performance calculation module +/// +class NetworkPerformance +{ +private: + Architecture *_arch; + const std::vector> &_ops; + +public: + NetworkPerformance(Architecture *arch, const std::vector> &ops); + +public: + PerformanceResult Measure(Schedule *schedule, OptimiserDatabase *optDb); + +private: + PerformanceResult EstimateFullOpPerformance( + SchedulerOperation *schedOp, SchedulerOpInfo *cost, SchedulerOperation *prevOp, SchedulerOpInfo *prevCost); + void AddToDatabase(const PerformanceResult &perf, const std::unique_ptr &schedOp, int opTable, + int columns, const std::unordered_set &memories, OptimiserDatabase *optDb); +}; + + + +} // namespace regor diff --git a/ethosu/regor/compiler/op_type.cpp b/ethosu/regor/compiler/op_type.cpp new file mode 100644 index 00000000..1d7ee8bf --- /dev/null +++ b/ethosu/regor/compiler/op_type.cpp @@ -0,0 +1,199 @@ +// +// SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "compiler/op_type.hpp" + +BEGIN_ENUM_TABLE(regor::OpType) + ADD_ENUM_NAME(None) + // TOSA equivalent ops + ADD_ENUM_NAME(ArgMax) + ADD_ENUM_NAME(AvgPool) + ADD_ENUM_NAME(Conv2D) + ADD_ENUM_NAME(Conv3D) + ADD_ENUM_NAME(DepthwiseConv2DBias) + ADD_ENUM_NAME(FullyConnected) + ADD_ENUM_NAME(MatMul) + ADD_ENUM_NAME(MaxPool) + ADD_ENUM_NAME(TransposeConv2D) + ADD_ENUM_NAME(Clamp) + ADD_ENUM_NAME(Sigmoid) + ADD_ENUM_NAME(Tanh) + ADD_ENUM_NAME(Add) + ADD_ENUM_NAME(Asr) + ADD_ENUM_NAME(And) + ADD_ENUM_NAME(Or) + ADD_ENUM_NAME(Xor) + ADD_ENUM_NAME(Div) + ADD_ENUM_NAME(LogicalAnd) + ADD_ENUM_NAME(SHL) + ADD_ENUM_NAME(SHR) + ADD_ENUM_NAME(LogicalOr) + ADD_ENUM_NAME(LogicalXor) + ADD_ENUM_NAME(Maximum) + ADD_ENUM_NAME(Minimum) + ADD_ENUM_NAME(Mul) + ADD_ENUM_NAME(Pow) + ADD_ENUM_NAME(Sub) + ADD_ENUM_NAME(LUT) + ADD_ENUM_NAME(Abs) + ADD_ENUM_NAME(Not) + ADD_ENUM_NAME(Ceil) + ADD_ENUM_NAME(CLZ) + ADD_ENUM_NAME(Exp) + ADD_ENUM_NAME(Floor) + ADD_ENUM_NAME(LogicalNot) + ADD_ENUM_NAME(Neg) + ADD_ENUM_NAME(Reciprocal) + ADD_ENUM_NAME(Rsqrt) + ADD_ENUM_NAME(Select) + ADD_ENUM_NAME(Equal) + ADD_ENUM_NAME(Greater) + ADD_ENUM_NAME(GreaterEqual) + ADD_ENUM_NAME(ReduceAny) + ADD_ENUM_NAME(ReduceAll) + ADD_ENUM_NAME(ReduceMax) + ADD_ENUM_NAME(ReduceMin) + ADD_ENUM_NAME(ReduceProduct) + ADD_ENUM_NAME(ReduceSum) + ADD_ENUM_NAME(Concat) + ADD_ENUM_NAME(Pad) + ADD_ENUM_NAME(Reshape) + ADD_ENUM_NAME(Reverse) + ADD_ENUM_NAME(Slice) + ADD_ENUM_NAME(Tile) + ADD_ENUM_NAME(Transpose) + ADD_ENUM_NAME(Gather) + ADD_ENUM_NAME(Scatter) + ADD_ENUM_NAME(Resize) + ADD_ENUM_NAME(Cast) + 
ADD_ENUM_NAME(Rescale) + ADD_ENUM_NAME(Identity) + ADD_ENUM_NAME(If) + ADD_ENUM_NAME(While) + // Regor Internal Operators + ADD_ENUM_NAME(MemoryCopy) + // Compatibility Operators + ADD_ENUM_NAME(AddN) + ADD_ENUM_NAME(Any) + ADD_ENUM_NAME(ArgMin) + ADD_ENUM_NAME(BatchMatMul) + ADD_ENUM_NAME(BatchToSpaceND) + ADD_ENUM_NAME(BidirectionalSequenceLstm) + ADD_ENUM_NAME(BidirectionalSequenceRnn) + ADD_ENUM_NAME(BlockLSTM) + ADD_ENUM_NAME(Call) + ADD_ENUM_NAME(Clip) + ADD_ENUM_NAME(ConcatEmbeddings) + ADD_ENUM_NAME(ConcatTFLite) + ADD_ENUM_NAME(Const) + ADD_ENUM_NAME(Conv2DBackpropInput) + ADD_ENUM_NAME(Conv2DBackpropInputSwitchedBias) + ADD_ENUM_NAME(Conv2DBias) + ADD_ENUM_NAME(Cos) + ADD_ENUM_NAME(Cumsum) + ADD_ENUM_NAME(Custom) + ADD_ENUM_NAME(CustomNpuOp) + ADD_ENUM_NAME(Delegate) + ADD_ENUM_NAME(Densify) + ADD_ENUM_NAME(DepthToSpace) + ADD_ENUM_NAME(Dequantize) + ADD_ENUM_NAME(Elu) + ADD_ENUM_NAME(EmbeddingLookup) + ADD_ENUM_NAME(EmbeddingLookupSparse) + ADD_ENUM_NAME(ExpandDims) + ADD_ENUM_NAME(FakeQuantWithMinMaxArgs) + ADD_ENUM_NAME(Fill) + ADD_ENUM_NAME(FloorDiv) + ADD_ENUM_NAME(FloorMod) + ADD_ENUM_NAME(GatherNd) + ADD_ENUM_NAME(GatherV2) + ADD_ENUM_NAME(HardSwish) + ADD_ENUM_NAME(HashtableLookup) + ADD_ENUM_NAME(L2Norm) + ADD_ENUM_NAME(L2Pool2D) + ADD_ENUM_NAME(LRN) + ADD_ENUM_NAME(LSHProjection) + ADD_ENUM_NAME(LeakyRelu) + ADD_ENUM_NAME(Less) + ADD_ENUM_NAME(LessEqual) + ADD_ENUM_NAME(Log) + ADD_ENUM_NAME(LogSoftmax) + ADD_ENUM_NAME(Lstm) + ADD_ENUM_NAME(MatrixDiag) + ADD_ENUM_NAME(MatrixSetDiag) + ADD_ENUM_NAME(Max) + ADD_ENUM_NAME(Mean) + ADD_ENUM_NAME(Min) + ADD_ENUM_NAME(MirrorPad) + ADD_ENUM_NAME(NonMaxSuppressionV4) + ADD_ENUM_NAME(NonMaxSuppressionV5) + ADD_ENUM_NAME(NotEqual) + ADD_ENUM_NAME(OneHot) + ADD_ENUM_NAME(Pack) + ADD_ENUM_NAME(PadV2) + ADD_ENUM_NAME(Placeholder) + ADD_ENUM_NAME(Prelu) + ADD_ENUM_NAME(Prod) + ADD_ENUM_NAME(Quantize) + ADD_ENUM_NAME(QuantizedAvgPool) + ADD_ENUM_NAME(QuantizedConv2D) + ADD_ENUM_NAME(QuantizedMatMul) + 
ADD_ENUM_NAME(QuantizedMaxPool) + ADD_ENUM_NAME(QuantizedReshape) + ADD_ENUM_NAME(Range) + ADD_ENUM_NAME(Rank) + ADD_ENUM_NAME(Relu) + ADD_ENUM_NAME(Relu6) + ADD_ENUM_NAME(ReluN1To1) + ADD_ENUM_NAME(ReluN) + ADD_ENUM_NAME(ResizeBilinear) + ADD_ENUM_NAME(ResizeNearestNeighbor) + ADD_ENUM_NAME(ReverseSequence) + ADD_ENUM_NAME(ReverseV2) + ADD_ENUM_NAME(Rnn) + ADD_ENUM_NAME(Round) + ADD_ENUM_NAME(ScatterNd) + ADD_ENUM_NAME(SegmentSum) + ADD_ENUM_NAME(SelectV2) + ADD_ENUM_NAME(Shape) + ADD_ENUM_NAME(SignBit) + ADD_ENUM_NAME(Sin) + ADD_ENUM_NAME(SkipGram) + ADD_ENUM_NAME(Softmax) + ADD_ENUM_NAME(SpaceToBatchND) + ADD_ENUM_NAME(SpaceToDepth) + ADD_ENUM_NAME(SparseToDense) + ADD_ENUM_NAME(Split) + ADD_ENUM_NAME(SplitV) + ADD_ENUM_NAME(Sqrt) + ADD_ENUM_NAME(Square) + ADD_ENUM_NAME(SquaredDifference) + ADD_ENUM_NAME(Squeeze) + ADD_ENUM_NAME(StridedSlice) + ADD_ENUM_NAME(SubgraphInput) + ADD_ENUM_NAME(Sum) + ADD_ENUM_NAME(Svdf) + ADD_ENUM_NAME(TopKV2) + ADD_ENUM_NAME(UnidirectionalSequenceLstm) + ADD_ENUM_NAME(UnidirectionalSequenceRnn) + ADD_ENUM_NAME(Unique) + ADD_ENUM_NAME(Unpack) + ADD_ENUM_NAME(Where) + ADD_ENUM_NAME(ZerosLike) + ADD_ENUM_NAME(LookupTable) +END_ENUM_TABLE() diff --git a/ethosu/regor/compiler/op_type.hpp b/ethosu/regor/compiler/op_type.hpp new file mode 100644 index 00000000..018344a0 --- /dev/null +++ b/ethosu/regor/compiler/op_type.hpp @@ -0,0 +1,290 @@ +// +// SPDX-FileCopyrightText: Copyright 2021, 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "common/bit_flags.hpp" + +#include +#include + +namespace regor +{ + +enum class OpType : uint16_t +{ + None = 0, + // TOSA equivalent ops + ArgMax, + AvgPool, + Conv2D, + Conv3D, + DepthwiseConv2DBias, + FullyConnected, + MatMul, + MaxPool, + TransposeConv2D, + Clamp, + Sigmoid, + Tanh, + Add, + Asr, + And, + Or, + Xor, + Div, + LogicalAnd, + SHL, + SHR, + LogicalOr, + LogicalXor, + Maximum, + Minimum, + Mul, + Pow, + Sub, + LUT, + Abs, + Not, + Ceil, + CLZ, + Exp, + Floor, + LogicalNot, + Neg, + Reciprocal, + Rsqrt, + Select, + Equal, + Greater, + GreaterEqual, + ReduceAny, + ReduceAll, + ReduceMax, + ReduceMin, + ReduceProduct, + ReduceSum, + Concat, + Pad, + Reshape, + Reverse, + Slice, + Tile, + Transpose, + Gather, + Scatter, + Resize, + Cast, + Rescale, + Identity, + If, + While, + + // Regor Internal Operators + MemoryCopy, + + // Compatibility Operators + AddN, + Any, + ArgMin, + BatchMatMul, + BatchToSpaceND, + BidirectionalSequenceLstm, + BidirectionalSequenceRnn, + BlockLSTM, + Call, + Clip, + ConcatEmbeddings, + ConcatTFLite, + Const, + Conv2DBackpropInput, + Conv2DBackpropInputSwitchedBias, + Conv2DBias, + Cos, + Cumsum, + Custom, + CustomNpuOp, + Delegate, + Densify, + DepthToSpace, + Dequantize, + Elu, + EmbeddingLookup, + EmbeddingLookupSparse, + ExpandDims, + FakeQuantWithMinMaxArgs, + Fill, + FloorDiv, + FloorMod, + GatherNd, + GatherV2, + HardSwish, + HashtableLookup, + L2Norm, + L2Pool2D, + LRN, + LSHProjection, + LeakyRelu, + Less, + LessEqual, + Log, + LogSoftmax, + Lstm, + MatrixDiag, + MatrixSetDiag, + Max, + Mean, + Min, + MirrorPad, + NonMaxSuppressionV4, + NonMaxSuppressionV5, + NotEqual, + OneHot, + Pack, + PadV2, + Placeholder, + Prelu, + Prod, + Quantize, + QuantizedAvgPool, + QuantizedConv2D, + QuantizedMatMul, + QuantizedMaxPool, + QuantizedReshape, + Range, + Rank, + Relu, + Relu6, + ReluN1To1, 
+ ReluN, + ResizeBilinear, + ResizeNearestNeighbor, + ReverseSequence, + ReverseV2, + Rnn, + Round, + ScatterNd, + SegmentSum, + SelectV2, + Shape, + SignBit, + Sin, + SkipGram, + Softmax, + SpaceToBatchND, + SpaceToDepth, + SparseToDense, + Split, + SplitV, + Sqrt, + Square, + SquaredDifference, + Squeeze, + StridedSlice, + SubgraphInput, + Sum, + Svdf, + TopKV2, + UnidirectionalSequenceLstm, + UnidirectionalSequenceRnn, + Unique, + Unpack, + Where, + ZerosLike, + LookupTable, + ENUM_END +}; + +inline std::string OpTypeToString(const OpType type) +{ + return EnumToString(type); +} + +constexpr inline bool IsUnaryElementwise(OpType opType) +{ + return opType == OpType::Abs || opType == OpType::LeakyRelu || opType == OpType::CLZ || + opType == OpType::LogicalNot || opType == OpType::Not || opType == OpType::Neg; +} + +constexpr inline bool IsBinaryElementwise(OpType opType) +{ + return opType == OpType::Add || opType == OpType::Sub || opType == OpType::Mul || opType == OpType::Minimum || + opType == OpType::Maximum || opType == OpType::SHL || opType == OpType::SHR || opType == OpType::Div || + opType == OpType::LogicalAnd || opType == OpType::LogicalOr || opType == OpType::LogicalXor || + opType == OpType::Xor || opType == OpType::And || opType == OpType::Or || opType == OpType::Asr || + opType == OpType::Equal || opType == OpType::Greater || opType == OpType::GreaterEqual || opType == OpType::NotEqual; +} + +constexpr inline bool IsElementwise(OpType opType) +{ + return IsUnaryElementwise(opType) || IsBinaryElementwise(opType); +} + +constexpr inline bool IsDepthwise(OpType opType) +{ + return opType == OpType::DepthwiseConv2DBias; +} + +constexpr inline bool IsConvolution(OpType opType) +{ + return opType == OpType::Conv2D || opType == OpType::Conv2DBackpropInput || opType == OpType::Conv2DBackpropInputSwitchedBias || + opType == OpType::Conv2DBias || opType == OpType::DepthwiseConv2DBias; +} + +constexpr inline bool IsPooling(OpType opType) +{ + return opType == 
OpType::MaxPool || opType == OpType::AvgPool || opType == OpType::QuantizedAvgPool || opType == OpType::QuantizedMaxPool || + opType == OpType::ReduceSum || opType == OpType::Sum || opType == OpType::Min || opType == OpType::ArgMax; +} + +constexpr inline bool IsVectorProduct(OpType opType) +{ + return opType == OpType::FullyConnected || opType == OpType::BidirectionalSequenceLstm || opType == OpType::BidirectionalSequenceRnn || + opType == OpType::BlockLSTM || opType == OpType::Lstm || opType == OpType::MatMul || opType == OpType::Rnn || + opType == OpType::UnidirectionalSequenceLstm || opType == OpType::UnidirectionalSequenceRnn; +} + +constexpr inline bool IsDma(OpType opType) +{ + return opType == OpType::Scatter || opType == OpType::ScatterNd || opType == OpType::Gather || opType == OpType::GatherV2 || opType == OpType::GatherNd; +} + +constexpr inline bool IsActivation(OpType opType) +{ + return opType == OpType::Relu || opType == OpType::Relu6 || opType == OpType::ReluN || opType == OpType::ReluN1To1 || + opType == OpType::Prelu || opType == OpType::Clip || opType == OpType::Sigmoid || opType == OpType::Tanh || opType == OpType::LUT; +} + +constexpr inline bool IsConcatenation(OpType opType) +{ + return opType == OpType::Concat || opType == OpType::ConcatEmbeddings || opType == OpType::ConcatTFLite; +} + +constexpr inline bool IsVariadic(OpType opType) +{ + return IsConcatenation(opType) || opType == OpType::Pack || opType == OpType::Maximum || opType == OpType::Minimum || + opType == OpType::AddN || opType == OpType::Custom || opType == OpType::CustomNpuOp; +} + +constexpr inline bool IsReshape(OpType opType) +{ + // The Reshape like operations: Reshape, Squeeze, and ExpandDims + return opType == OpType::Reshape || opType == OpType::QuantizedReshape || opType == OpType::Squeeze || opType == OpType::ExpandDims; +} + +} // namespace regor diff --git a/ethosu/regor/compiler/operation.cpp b/ethosu/regor/compiler/operation.cpp new file mode 100644 index 
00000000..5ffb6281 --- /dev/null +++ b/ethosu/regor/compiler/operation.cpp @@ -0,0 +1,168 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "compiler/operation.hpp" + +#include "common/ordered_map.hpp" +#include "common/scaling.hpp" +#include "graph_builder.hpp" +#include "kernel.hpp" +#include "op_type.hpp" +#include "tensor.hpp" + +#include + +namespace regor +{ + +Operation::Operation(OpType opType) : _type(opType), _parameters({}), _rounding(RoundMode::AUTO) +{ + // Default 1x1 kernel for ops without a kernel + _kernel = std::make_unique(Point2i(1, 1), Point2i(1, 1), Point2i(1, 1)); +} + +Operation::Operation(const Operation &op) : Operation(op._type) +{ + _kernel = std::make_unique(*op._kernel.get()); + _parameters = op._parameters; + _passthrough = op._passthrough; +} + +TensorUsage Operation::UsageOfTensor(const Tensor *tensor) const +{ + for ( const auto &list : {_inputs.pairs(), _outputs.pairs()} ) + { + for ( const auto &pair : list ) + { + if ( tensor == pair.second.tensor.get() ) + { + return pair.first; + } + } + } + return TensorUsage::None; +} + +Tensor *Operation::IFM(int index) const +{ + auto conn = Input(MakeTensorUsage(TensorUsage::IFM, index)); + return conn ? 
conn->tensor.get() : nullptr; +} + +Tensor *Operation::OFM() const +{ + return _outputs.at(TensorUsage::OFM).tensor.get(); +} + +void Operation::CopyInput(TensorUsage usage, const TensorConnection &tensorConnection) +{ + ConnectInput(usage, tensorConnection.tensor) + .Set(tensorConnection.shape) + .Set(tensorConnection.slice) + .Set(tensorConnection.quantization) + .Set(tensorConnection.transpose); +} + +TensorConnection &Operation::ConnectInput(TensorUsage usage, const std::shared_ptr &tensor) +{ + // Must create the new connection before destroying whatever it replaces, + // because the existing connection (if present) might be the last remaining reference to this operation. + tensor->AddReader(shared_from_this()); + + if ( _inputs.contains(usage) && (_inputs[usage].tensor != tensor) ) + { + _inputs[usage].tensor->RemoveReader(shared_from_this()); + } + _inputs[usage].tensor = tensor; + _inputs[usage].shape = tensor->StorageShape(); + return _inputs[usage]; +} + +void Operation::CopyOutput(TensorUsage usage, const TensorConnection &tensorConnection) +{ + ConnectOutput(usage, tensorConnection.tensor) + .Set(tensorConnection.shape) + .Set(tensorConnection.slice) + .Set(tensorConnection.quantization) + .Set(tensorConnection.transpose); +} + +TensorConnection &Operation::ConnectOutput(TensorUsage usage, const std::shared_ptr &tensor) +{ + // Must create the new connection before destroying whatever it replaces, + // because the existing connection (if present) might be the last remaining reference to this operation. 
+ tensor->AddWriter(shared_from_this()); + + if ( _outputs.contains(usage) && (_outputs[usage].tensor != tensor) ) + { + _outputs[usage].tensor->RemoveWriter(shared_from_this()); + } + _outputs[usage].tensor = tensor; + _outputs[usage].shape = tensor->StorageShape(); + + return _outputs[usage]; +} + +void Operation::Disconnect() +{ + // This operation might be about to remove the last remaining references to itself, + // so it must hold onto this one until it is finished disconnecting. + auto self = shared_from_this(); + + for ( auto &conn : _inputs ) + { + conn.tensor->RemoveReader(self); + } + _inputs.clear(); + + for ( auto &conn : _outputs ) + { + conn.tensor->RemoveWriter(self); + } + _outputs.clear(); +} + +bool Operation::IsDisconnected() const +{ + return _inputs.empty() && _outputs.empty(); +} + +bool Operation::HasScaling() const +{ + bool scaled = true; + for ( const auto &fm : _inputs.pairs() ) + { + if ( fm.first == TensorUsage::IFM || fm.first == TensorUsage::IFM1 || fm.first == TensorUsage::OFM ) + { + if ( fm.second.quantization.scales.empty() ) + { + return false; + } + } + } + return scaled; +} + +void Operation::SetZeroPoint(GraphApi::GraphTensorUsage graphUsage, double zeroPoint) +{ + auto usage = GraphAPIUsageToTensorUsage(graphUsage); + auto &connections = (usage & regor::TensorUsage::TypeMask) == regor::TensorUsage::OFM ? _outputs : _inputs; + connections.at(usage).quantization.zeroPoints = {int64_t(zeroPoint)}; +} + +} // namespace regor diff --git a/ethosu/regor/compiler/operation.hpp b/ethosu/regor/compiler/operation.hpp new file mode 100644 index 00000000..1c23b6f5 --- /dev/null +++ b/ethosu/regor/compiler/operation.hpp @@ -0,0 +1,234 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "attributes.hpp" +#include "common/ordered_map.hpp" +#include "common/reverse_type.hpp" +#include "common/scaling.hpp" +#include "common/transpose_type.hpp" +#include "include/graphapi.hpp" +#include "kernel.hpp" +#include "op_type.hpp" +#include "quantization.hpp" +#include "tensor.hpp" + +#include + +namespace regor +{ + +enum class RoundMode : uint8_t +{ + DBL = 0, + TRUNCATE = 1, + NATURAL = 2, + TRUNCATE_TO_LOWER = 3, + DOUBLE_ASYMMETRIC = 4, + SYMMETRIC = 5, + AUTO = 0xff +}; + +// Parameters that apply only to particular operation types +union OpTypeParameters +{ + struct + { + float alpha; + } leaky_relu; + + struct + { + float beta; + } softmax; + + struct + { + int axis; + } concat; + + struct + { + int axis; + } pack_unpack; + + struct + { + int begin_mask; + int end_mask; + int ellipsis_mask; + int new_axis_mask; + int shrink_axis_mask; + } strided_slice; + + struct + { + bool alignCorners; + bool halfPixelCenters; + } resize; +}; + +struct TensorSlice +{ + Shape offset; + Shape shape; +}; + +struct TensorConnection +{ + std::shared_ptr tensor; + Shape shape; + // For operations accessing a slice of the tensor: + // Writing: Concat, ConcatTFLite, and Pack + // Reading: Split, SplitV, Unpack, Slice, and StridedSlice + TensorSlice slice; + Quantization quantization; + TransposeType transpose = TransposeType::None; + ReverseType reverse = ReverseType::None; + + TensorConnection &Set(const Shape &s) + { + shape = s; + return *this; + } + TensorConnection &Set(const TensorSlice &s) + { + slice = s; + return *this; + } 
+ TensorConnection &Set(const Quantization &q) + { + quantization = q; + return *this; + } + TensorConnection &Set(const TransposeType &t) + { + transpose = t; + return *this; + } + TensorConnection &Set(const ReverseType &r) + { + reverse = r; + return *this; + } +}; + + +/// +/// Graph Operation representation +/// +class Operation : public std::enable_shared_from_this, public GraphApi::GraphOperation +{ +private: + ordered_map _inputs; + ordered_map _outputs; + OpType _type; + std::unique_ptr _kernel; + DynamicRef _attr; + OpTypeParameters _parameters; // TODO: remove me + RoundMode _rounding; + const void *_passthrough = nullptr; // Original flatbuffer description of this op (if it was loaded from one) + +public: + Operation(OpType opType); + Operation(const Operation &op); + OpType Type() const { return _type; } + + const ordered_map &Outputs() const { return _outputs; } + const ordered_map &Inputs() const { return _inputs; } + + Tensor *IFM(int index) const; + Tensor *OFM() const; + + TensorConnection *Input(TensorUsage usage) { return _inputs.try_ref(usage); } + const TensorConnection *Input(TensorUsage usage) const { return _inputs.try_ref(usage); } + TensorConnection *Output(TensorUsage usage) { return _outputs.try_ref(usage); } + const TensorConnection *Output(TensorUsage usage) const { return _outputs.try_ref(usage); } + + TensorUsage UsageOfTensor(const Tensor *tensor) const; + const class Kernel *Kernel() const { return _kernel.get(); } + class Kernel *Kernel() { return _kernel.get(); } + void SetKernel(std::unique_ptr kernel) { _kernel = std::move(kernel); } + + const OpTypeParameters &Parameters() const { return _parameters; } + OpTypeParameters &Parameters() { return _parameters; } + + RoundMode Rounding() const { return _rounding; } + void SetRounding(RoundMode rounding) { _rounding = rounding; } + + const void *Passthrough() const { return _passthrough; } + void SetPassthrough(const void *passthrough) { _passthrough = passthrough; } + + void 
CopyInput(TensorUsage usage, const TensorConnection &tensorConnection); + TensorConnection &ConnectInput(TensorUsage usage, const std::shared_ptr &tensor); + int CountInputs(TensorUsage usage) const { return CountUsage(_inputs, usage); } + + void CopyOutput(TensorUsage usage, const TensorConnection &tensorConnection); + TensorConnection &ConnectOutput(TensorUsage usage, const std::shared_ptr &tensor); + int CountOutputs(TensorUsage usage) const { return CountUsage(_outputs, usage); } + + void Disconnect(); + bool IsDisconnected() const; + bool HasScaling() const; + + // Inherited via GraphOperation + void SetZeroPoint(GraphApi::GraphTensorUsage tensor, double zeroPoint) override; + + template + TYPE *Attribute() + { + if ( !_attr ) + { + _attr = CreateAttribute(TypeHash::HASH); + } + else if ( _attr.Info()->Hash() != TypeHash::HASH ) + { + throw std::runtime_error("attribute already assigned for this operator"); + } + + return static_cast(_attr.Instance()); + } + + DynamicRef *AttributeByKey(uint32_t hash) + { + if ( !_attr ) + { + _attr = CreateAttribute(hash); + } + return &_attr; + } + + const DynamicRef &AttributeRef() const { return _attr; } + +private: + int CountUsage(const ordered_map &list, TensorUsage usage) const + { + int count = 0; + for ( const auto &pair : list.pairs() ) + { + if ( (pair.first & TensorUsage::TypeMask) == usage ) + { + count++; + } + } + return count; + } +}; + +} // namespace regor diff --git a/ethosu/regor/compiler/operation_util.hpp b/ethosu/regor/compiler/operation_util.hpp new file mode 100644 index 00000000..63ef2e74 --- /dev/null +++ b/ethosu/regor/compiler/operation_util.hpp @@ -0,0 +1,242 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "architecture/architecture.hpp" +#include "common/buffer_view.hpp" +#include "operation.hpp" +#include "quantization.hpp" +#include "tensor.hpp" + +namespace regor +{ + +inline std::shared_ptr CreateConstTensor( + const std::string &name, DataType type, const std::shared_ptr &buffer, const Shape *shape = nullptr) +{ + Shape tensorShape; + if ( shape == nullptr ) + { + tensorShape = Shape(DataTypeElements(type, buffer->Size())); + } + else + { + tensorShape = *shape; + } + auto tensor = std::make_shared(name, type, tensorShape, buffer); + return tensor; +} + +template +std::shared_ptr CreateConstTensor(const std::string &name, T value) +{ + auto buf = std::make_shared(std::vector{value}); + return CreateConstTensor(name, DataTypeOf::value, buf); +} + +template +std::shared_ptr NewBufferFromView(const BufferView &src) +{ + auto size = src.ViewShape().Elements(); + auto buf = std::make_shared(size, static_cast(nullptr)); + auto bufView = BufferView(buf, src); + const auto srcData = src.Values(); + auto dstData = bufView.template WritableValues(); + for ( int i = 0; i < size; i++ ) + { + dstData[i] = srcData[i]; + } + return buf; +} + +inline std::shared_ptr CreateSliceCopy(const std::string &name, const Tensor *src, const TensorSlice &slice) +{ + assert(src->IsConstant()); + auto sliceView = src->View().SubView(slice.offset, slice.shape); + std::shared_ptr buffer; + switch ( src->Type() ) + { + case DataType::Int8: + buffer = NewBufferFromView(sliceView); + break; + case DataType::UInt8: + buffer = NewBufferFromView(sliceView); 
+ break; + case DataType::Int16: + buffer = NewBufferFromView(sliceView); + break; + case DataType::Int32: + buffer = NewBufferFromView(sliceView); + break; + default: + assert(false); + break; + } + auto tensor = CreateConstTensor(name, src->Type(), buffer, &slice.shape); + return tensor; +} + +inline Operation *CreateLUT(const std::shared_ptr &ifm, const std::shared_ptr &lut, const Quantization &ifmQuantization, + const Quantization &ofmQuantization, DataType dtype = DataType::None, const Shape *ifmShape = nullptr, + std::shared_ptr ofm = nullptr, TensorSlice ifmSlice = {}, TensorSlice ofmSlice = {}) +{ + auto op = std::make_shared(OpType::LUT); + if ( dtype == DataType::None ) + { + dtype = lut->Type(); + } + if ( ifmShape == nullptr ) + { + ifmShape = &ifm->StorageShape(); + } + op->ConnectInput(TensorUsage::IFM, ifm).Set(*ifmShape).Set(ifmQuantization).Set(ifmSlice); + + op->ConnectInput(TensorUsage::LUT, lut); + if ( ofm == nullptr ) + { + ofm = std::make_shared(ifm->Name() + "/lut", dtype); + ofm->SetStorageShape(*ifmShape); + } + op->ConnectOutput(TensorUsage::OFM, ofm).Set(ofm->StorageShape()).Set(ofmQuantization).Set(ofmSlice); + return op.get(); +} + +inline Operation *CreateDepthwiseMaxpool(const std::shared_ptr &ifm, const Shape &ifmShape, + const Quantization &ifmQuantization, const Quantization &ofmQuantization) +{ + auto op = std::make_shared(OpType::MaxPool); + int height = ifmShape.ElementsWH(); + int width = ifmShape.Depth(); + auto kernel = std::make_unique(Point2i(width, 1), Point2i(1, 1), Point2i(1, 1), 1); + auto ofm = std::make_shared(ifm->Name() + "/maxpool", ifm->Type()); + ofm->SetStorageShape(Shape(1, ifmShape.Height(), ifmShape.Width(), 1)); + op->SetKernel(std::move(kernel)); + + op->ConnectInput(TensorUsage::IFM, ifm).Set(ifmQuantization); + op->Input(TensorUsage::IFM)->shape = Shape(1, height, width, 1); + op->ConnectOutput(TensorUsage::OFM, ofm).Set(ofmQuantization); + op->Output(TensorUsage::OFM)->shape = Shape(1, height, 1, 1); + 
return op.get(); +} + +inline Operation *CreateReduceSum(const std::shared_ptr &ifm, const Quantization &ifmQuantization, const Quantization &ofmQuantization) +{ + const auto &ifmShape = ifm->StorageShape(); + auto op = std::make_shared(OpType::ReduceSum); + auto ofm = std::make_shared(ifm->Name() + "/reducesum", DataType::Int32); + ofm->SetStorageShape(Shape(1, ifmShape.Height(), ifmShape.Width(), 1)); + op->ConnectInput(TensorUsage::IFM, ifm).Set(ifmQuantization); + op->ConnectOutput(TensorUsage::OFM, ofm).Set(ofmQuantization); + return op.get(); +} + +inline Operation *CreateElementwise(OpType type, const std::shared_ptr &ifm, const std::shared_ptr &ifm2, + const Quantization &ifmQuantization, const Quantization &ifm2Quantization, const Quantization &ofmQuantization, + DataType dtype = DataType::None, const Shape *ifmShape = nullptr, const Shape *ifm2Shape = nullptr) +{ + assert(IsElementwise(type)); + auto op = std::make_shared(type); + op->ConnectInput(TensorUsage::IFM, ifm).Set(ifmQuantization); + if ( ifmShape ) op->Input(TensorUsage::IFM)->shape = *ifmShape; + if ( ifm2 ) + { + op->ConnectInput(TensorUsage::IFM1, ifm2).Set(ifm2Quantization); + if ( ifm2Shape ) op->Input(TensorUsage::IFM1)->shape = *ifm2Shape; + } + + if ( dtype == DataType::None ) dtype = ifm->Type(); + + Shape ofmShape = op->Input(TensorUsage::IFM)->shape; + // If reverse operands use ifm2 shape as ofm shape + if ( ifm2 && ((ofmShape.Elements() == 1 && ifm->IsConstant()) || ofmShape.IsSubShapeOf(op->Input(TensorUsage::IFM1)->shape)) ) + { + ofmShape = ifm2->StorageShape(); + } + + auto ofm = std::make_shared(ifm->Name() + "/" + OpTypeToString(type), dtype); + ofm->SetStorageShape(ofmShape); + op->ConnectOutput(TensorUsage::OFM, ofm).Set(ofmQuantization); + return op.get(); +} + +inline Operation *CreateBinaryElementwise(OpType type, const std::shared_ptr &ifm, const std::shared_ptr &ifm2, + const Quantization &ifmQuantization, const Quantization &ifm2Quantization, const Quantization 
&ofmQuantization, + DataType dtype = DataType::None, const Shape *ifmShape = nullptr, const Shape *ifm2Shape = nullptr) +{ + assert(IsBinaryElementwise(type)); + return CreateElementwise(type, ifm, ifm2, ifmQuantization, ifm2Quantization, ofmQuantization, dtype, ifmShape, ifm2Shape); +} + +inline Operation *CreateUnaryElementwise(OpType type, const std::shared_ptr &ifm, const Quantization &ifmQuantization, + const Quantization &ofmQuantization, DataType dtype = DataType::None, const Shape *ifmShape = nullptr) +{ + assert(IsUnaryElementwise(type)); + return CreateElementwise(type, ifm, nullptr, ifmQuantization, {}, ofmQuantization, dtype, ifmShape); +} + +inline Operation *CreateClz(const std::shared_ptr &ifm, const Quantization &ifmQuantization, + const Quantization &ofmQuantization, DataType dtype = DataType::None, const Shape *ifmShape = nullptr) +{ + return CreateUnaryElementwise(OpType::CLZ, ifm, ifmQuantization, ofmQuantization, dtype, ifmShape); +} + +inline Operation *CreateAdd(const std::shared_ptr &ifm, const std::shared_ptr &ifm2, + const Quantization &ifmQuantization, const Quantization &ifm2Quantization, const Quantization &ofmQuantization, + DataType dtype = DataType::None, const Shape *ifmShape = nullptr, const Shape *ifm2Shape = nullptr) +{ + return CreateBinaryElementwise(OpType::Add, ifm, ifm2, ifmQuantization, ifm2Quantization, ofmQuantization, dtype, ifmShape, ifm2Shape); +} + +inline Operation *CreateMul(const std::shared_ptr &ifm, const std::shared_ptr &ifm2, + const Quantization &ifmQuantization, const Quantization &ifm2Quantization, const Quantization &ofmQuantization, + DataType dtype = DataType::None, const Shape *ifmShape = nullptr, const Shape *ifm2Shape = nullptr) +{ + return CreateBinaryElementwise(OpType::Mul, ifm, ifm2, ifmQuantization, ifm2Quantization, ofmQuantization, dtype, ifmShape, ifm2Shape); +} + +inline Operation *CreateSub(const std::shared_ptr &ifm, const std::shared_ptr &ifm2, + const Quantization &ifmQuantization, const 
Quantization &ifm2Quantization, const Quantization &ofmQuantization, + DataType dtype = DataType::None, const Shape *ifmShape = nullptr, const Shape *ifm2Shape = nullptr) +{ + return CreateBinaryElementwise(OpType::Sub, ifm, ifm2, ifmQuantization, ifm2Quantization, ofmQuantization, dtype, ifmShape, ifm2Shape); +} + +inline Operation *CreateShl(const std::shared_ptr &ifm, const std::shared_ptr &ifm2, + const Quantization &ifmQuantization, const Quantization &ifm2Quantization, const Quantization &ofmQuantization, + DataType dtype = DataType::None, const Shape *ifmShape = nullptr, const Shape *ifm2Shape = nullptr) +{ + return CreateBinaryElementwise(OpType::SHL, ifm, ifm2, ifmQuantization, ifm2Quantization, ofmQuantization, dtype, ifmShape, ifm2Shape); +} + +inline Operation *CreateAsr(const std::shared_ptr &ifm, const std::shared_ptr &ifm2, + const Quantization &ifmQuantization, const Quantization &ifm2Quantization, const Quantization &ofmQuantization, + DataType dtype = DataType::None, const Shape *ifmShape = nullptr, const Shape *ifm2Shape = nullptr) +{ + return CreateBinaryElementwise(OpType::Asr, ifm, ifm2, ifmQuantization, ifm2Quantization, ofmQuantization, dtype, ifmShape, ifm2Shape); +} + +inline Operation *CreateRescaleAdd(const std::shared_ptr &ifm, const std::shared_ptr &ifm2, const Quantization &ifmQuantization, + const Quantization &ifm2Quantization, const Quantization &ofmQuantization, int32_t scale, int shift) +{ + auto op = CreateBinaryElementwise(OpType::Add, ifm, ifm2, ifmQuantization, ifm2Quantization, ofmQuantization); + op->Output(TensorUsage::OFM)->quantization.scales.push_back(QuantizedScale(scale, shift)); + return op; +} + +} // namespace regor diff --git a/ethosu/regor/compiler/optimiser_utils.cpp b/ethosu/regor/compiler/optimiser_utils.cpp new file mode 100644 index 00000000..4090d9c9 --- /dev/null +++ b/ethosu/regor/compiler/optimiser_utils.cpp @@ -0,0 +1,143 @@ +// +// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its 
affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +#include "optimiser_utils.hpp" + +#include "operation.hpp" +namespace regor::GraphOptimisation +{ + +// Find specified tensor in Inputs() / Outputs() vectors. +// returns true if found in given vector. +bool IsTensorInVector(const std::vector> &tensorVec, const Tensor *const tensorToFind) +{ + auto pos = std::find_if( + tensorVec.begin(), tensorVec.end(), [&](const std::shared_ptr &t) { return t.get() == tensorToFind; }); + + return (pos != tensorVec.end()); +} + +// Insert a MemoryCopy operation after given ifm tensor. Returns a copy op shared_ptr. +// Will make a clone of ifm as ofm and connects any other consumers of the ifm to it. 
+std::shared_ptr InsertCopyOpAfterTensor(const std::shared_ptr &ifm, const Quantization &quantization) +{ + std::shared_ptr copyTensor = ifm->Clone(); + auto copyOp = std::make_shared(OpType::MemoryCopy); + copyOp->ConnectInput(TensorUsage::IFM0, ifm).Set(quantization); + auto name = ifm->Name(); + name.append("_copy"); + copyTensor->SetName(name); + copyOp->ConnectOutput(TensorUsage::OFM, copyTensor).Set(quantization); + + std::vector> ifmReaders(ifm->Readers()); + for ( const auto &opReader : ifmReaders ) + { + auto *cons = opReader.get(); + if ( cons != copyOp.get() ) + { + auto idx = 0; + auto usage = MakeTensorUsage(TensorUsage::IFM, 0); + auto *consIfmConn = cons->Input(usage); + + while ( consIfmConn != nullptr ) + { + if ( consIfmConn->tensor.get() == ifm.get() ) + { + cons->ConnectInput(usage, copyTensor).Set(quantization); + } + usage = MakeTensorUsage(TensorUsage::IFM, ++idx); + consIfmConn = cons->Input(usage); + } + } + } + return copyOp; +} + +// Connects output to operations in given list. Will not replace connection shape. +// Parameters: +// - producerList: List of producers. +// - tensorToReplace: if OFM on consumer match this tensor, replace it. +// - newTensor: The new output tensor to connect. +void ReplaceProducerOutput(std::vector> producerList, const Tensor *const tensorToReplace, + std::shared_ptr newTensor) +{ + // Not passed by reference. Original can be modified in loop. + for ( const auto &producer : producerList ) + { + Operation *prod = producer.get(); + auto idx = 0; + auto usage = MakeTensorUsage(TensorUsage::OFM, 0); + auto prodOfmConn = prod->Output(usage); + + while ( prodOfmConn != nullptr ) + { + if ( prodOfmConn->tensor.get() == tensorToReplace ) + { + // Do not want to replace the shape. Only the tensor and add writers. + // As ConnectOutput but do not replace shape. 
+ newTensor->AddWriter(prod->shared_from_this()); + if ( prodOfmConn->tensor != newTensor ) + { + prodOfmConn->tensor->RemoveWriter(prod->shared_from_this()); + } + prodOfmConn->tensor = newTensor; + } + usage = MakeTensorUsage(TensorUsage::OFM, ++idx); + prodOfmConn = prod->Output(usage); + } + } +} + + +// Connects input to operations in given list. Will not replace connection shape. +// Parameters: +// - exemptOperation: operation to exempt. +// - consumerList: List of consumers. +// - tensorToReplace: if IFM on consumer match this tensor, replace it. +// - newTensor: The new input tensor to connect. +void ReplaceConsumerInput(const Operation *const exemptOperation, std::vector> consumerList, + const Tensor *const tensorToReplace, std::shared_ptr newTensor) +{ + // Not passed by reference. Original can be modified in loop. + for ( const auto &consumer : consumerList ) + { + Operation *cons = consumer.get(); + auto idx = 0; + auto usage = MakeTensorUsage(TensorUsage::IFM, 0); + auto *consIfmConn = cons->Input(usage); + + while ( consIfmConn != nullptr ) + { + if ( consIfmConn->tensor.get() == tensorToReplace && cons != exemptOperation ) + { + // Do not want to replace the shape. Only the tensor and add writers. + // As ConnectInput but do not replace shape. 
+ newTensor->AddReader(cons->shared_from_this()); + if ( consIfmConn->tensor != newTensor ) + { + consIfmConn->tensor->RemoveReader(cons->shared_from_this()); + } + consIfmConn->tensor = newTensor; + } + usage = MakeTensorUsage(TensorUsage::IFM, ++idx); + consIfmConn = cons->Input(usage); + } + } +} + + +} // namespace regor::GraphOptimisation diff --git a/ethosu/regor/compiler/optimiser_utils.hpp b/ethosu/regor/compiler/optimiser_utils.hpp new file mode 100644 index 00000000..94d3efcf --- /dev/null +++ b/ethosu/regor/compiler/optimiser_utils.hpp @@ -0,0 +1,55 @@ +// +// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "quantization.hpp" +#include "tensor.hpp" + +#include + +namespace regor::GraphOptimisation +{ + +// Find specified tensor in Inputs() / Outputs() vectors. +// returns true if found in given vector. +bool IsTensorInVector(const std::vector> &tensorVec, const Tensor *const tensorToFind); + +// Insert a MemoryCopy operation after given ifm tensor. Returns a copy op shared_ptr. +// Will make a clone of ifm as ofm and connects any other consumers of the ifm to it. +std::shared_ptr InsertCopyOpAfterTensor(std::shared_ptr const &ifm, const Quantization &quantization); + + +// Connects output to operations in given list. Will not replace connection shape. +// Parameters: +// - producerList: List of producers. 
+// - tensorToReplace: if OFM on consumer match this tensor, replace it. +// - newTensor: The new output tensor to connect. +void ReplaceProducerOutput(std::vector> producerList, const Tensor *const tensorToReplace, + std::shared_ptr newTensor); + +// Connects input to operations in given list. Will not replace connection shape. +// Parameters: +// - exemptOperation: operation to exempt. +// - consumerList: List of consumers. +// - tensorToReplace: if IFM on consumer match this tensor, replace it. +// - newTensor: The new input tensor to connect. +void ReplaceConsumerInput(const Operation *const exemptOperation, std::vector> consumerList, + const Tensor *const tensorToReplace, std::shared_ptr newTensor); + +} // namespace regor::GraphOptimisation diff --git a/ethosu/regor/compiler/quantization.cpp b/ethosu/regor/compiler/quantization.cpp new file mode 100644 index 00000000..0bc4d4c8 --- /dev/null +++ b/ethosu/regor/compiler/quantization.cpp @@ -0,0 +1,59 @@ +// +// SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#include "quantization.hpp" + +#include "common/common.hpp" + +namespace regor +{ + +std::string Quantization::ToString() const +{ + std::vector scale; + for ( const QuantizedScale &s : scales ) + { + scale.push_back(fmt::format("(scale:{}, shift:{})", s.scale, s.shift)); + } + return fmt::format("scale: [{}], zero_point: [{}], quantMin: [{}], quantMax: [{}], dimension: {}, force_zero_point: {}", + fmt::join(scale, ", "), fmt::join(zeroPoints, ", "), fmt::join(quantMin, ", "), fmt::join(quantMax, ", "), dimension, forceZeroPoint); +} + +bool Quantization::operator==(const Quantization &rhs) const +{ + return std::tie(scales, zeroPoints, quantMin, quantMax, dimension, forceZeroPoint) == + std::tie(rhs.scales, rhs.zeroPoints, rhs.quantMin, rhs.quantMax, rhs.dimension, rhs.forceZeroPoint); +} + +bool Quantization::operator!=(const Quantization &rhs) const +{ + return !(*this == rhs); +} + +const Quantization &Quantization::Unit() +{ + static Quantization unitQuantization; + if ( unitQuantization.scales.empty() ) + { + unitQuantization.scales.emplace_back(QuantizedScale{1, 0}); + unitQuantization.zeroPoints.emplace_back(0); + } + return unitQuantization; +} + +} // namespace regor diff --git a/ethosu/regor/compiler/quantization.hpp b/ethosu/regor/compiler/quantization.hpp new file mode 100644 index 00000000..edc14d62 --- /dev/null +++ b/ethosu/regor/compiler/quantization.hpp @@ -0,0 +1,94 @@ +// +// SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "common/common.hpp" + +#include "common/scaling.hpp" + +#include + +namespace regor +{ + +enum class QuantizationType +{ + TFLITE, // TFLite-specific rescale in backend + EXPLICIT, // Explicit scaling +}; + +class Quantization +{ +public: + QuantizationType type = QuantizationType::EXPLICIT; + std::vector scales; + std::vector zeroPoints; + std::vector quantMin; + std::vector quantMax; + int dimension = 0; + bool forceZeroPoint = false; + +public: + Quantization() = default; + Quantization(Quantization &&other) noexcept { *this = std::move(other); } + Quantization(const Quantization &other) { *this = other; } + + static const Quantization &Unit(); + bool operator==(const Quantization &rhs) const; + bool operator!=(const Quantization &rhs) const; + std::string ToString() const; + bool IsValid() const { return !zeroPoints.empty() && !scales.empty(); } + bool EqualScales(const Quantization &other) const + { + return other.scales == scales && other.zeroPoints == zeroPoints; + } + explicit operator bool() const { return IsValid(); } + + Quantization &operator=(const Quantization &other) + { + if ( this != &other ) + { + type = other.type; + scales = other.scales; + zeroPoints = other.zeroPoints; + quantMin = other.quantMin; + quantMax = other.quantMax; + dimension = other.dimension; + forceZeroPoint = other.forceZeroPoint; + } + return *this; + } + + Quantization &operator=(Quantization &&other) noexcept + { + if ( this != &other ) + { + type = other.type; + scales = std::move(other.scales); + zeroPoints = std::move(other.zeroPoints); + quantMin = std::move(other.quantMin); + quantMax = std::move(other.quantMax); + dimension = other.dimension; + forceZeroPoint = other.forceZeroPoint; + } + return *this; + } +}; + +} // namespace regor diff --git a/ethosu/regor/compiler/raw_writer.cpp b/ethosu/regor/compiler/raw_writer.cpp new file 
mode 100644 index 00000000..22a5b216 --- /dev/null +++ b/ethosu/regor/compiler/raw_writer.cpp @@ -0,0 +1,260 @@ +// +// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "raw_writer.hpp" + +#include "common/logging.hpp" + +#include +#include +#include + +#include "include/regor.h" + + +namespace regor +{ + +static constexpr int WEIGHTS_REGION = 0; +static constexpr int SCRATCH_REGION = 1; +static constexpr int SCRATCH_FAST_REGION = 2; +static constexpr int INPUT_REGION = SCRATCH_REGION; +static constexpr int OUTPUT_REGION = SCRATCH_REGION; + +std::vector, size_t>> RawWriter::Serialise( + const std::vector> &graphs, const std::vector> &tensor_address_maps) +{ + if ( graphs.size() != 1 ) + { + throw std::invalid_argument("RawWriter expects 1 graph"); + } + const auto &graph = graphs[0]; + + std::vector operations; + graph->GetAllOperations(operations); + if ( operations.size() != 1 ) + { + throw std::invalid_argument("RawWriter expects graph with 1 operation"); + } + + if ( tensor_address_maps.size() != 1 ) + { + throw std::invalid_argument("RawWriter expects 1 tensor address map"); + } + const auto &tensor_address_map = tensor_address_maps[0]; + + const Operation *customNpuOp = operations[0]; + if ( customNpuOp->Type() != OpType::CustomNpuOp ) + { + throw std::invalid_argument("RawWriter expects graph with 1 CustomNpuOp"); + } + + const 
auto &graphInputs = graph->Inputs(); + std::unordered_set> inputsSet(graphInputs.begin(), graphInputs.end()); + + const auto &graphOutputs = graph->Outputs(); + std::unordered_set> outputsSet(graphOutputs.begin(), graphOutputs.end()); + + // ethos_u_command_stream in TFLite format + auto commandStreamTensorConnection = customNpuOp->Input(MakeTensorUsage(TensorUsage::Params, 0)); + auto commandStreamTensor = commandStreamTensorConnection->tensor.get(); + SerialiseCommandStreamTensor(commandStreamTensor); + + // read_only in TFLite format + auto readOnlyTensorConnection = customNpuOp->Input(MakeTensorUsage(TensorUsage::Params, 1)); + auto readOnlyTensor = readOnlyTensorConnection->tensor.get(); + SerialiseReadOnlyTensor(readOnlyTensor); + + // scratch in TFLite format + auto featureMapTensorConnection = customNpuOp->Input(MakeTensorUsage(TensorUsage::State, 0)); + auto featureMapTensor = featureMapTensorConnection->tensor.get(); + SerialiseScratchTensor(featureMapTensor, tensor_address_map.at(featureMapTensor)); + + // scratch/scratch_fast TFLite format + auto stagingTensorConnection = customNpuOp->Input(MakeTensorUsage(TensorUsage::State, 1)); + auto stagingTensor = stagingTensorConnection->tensor.get(); + if ( stagingTensor == featureMapTensor ) + { + SerialiseScratchTensor(stagingTensor, tensor_address_map.at(stagingTensor)); + } + else + { + SerialiseScratchFastTensor(stagingTensor, tensor_address_map.at(stagingTensor)); + } + + for ( const auto &[tensorUsage, tensorConnection] : customNpuOp->Inputs().pairs() ) + { + auto inputTensor = tensorConnection.tensor.get(); + if ( IsIFM(tensorUsage) && !inputTensor->IsConstant() ) + { + // Serialise input tensor + SerialiseInputTensor(inputTensor, tensor_address_map.at(inputTensor)); + } + } + + for ( const auto &[tensorUsage, tensorConnection] : customNpuOp->Outputs().pairs() ) + { + auto outputTensor = tensorConnection.tensor.get(); + if ( IsOFM(tensorUsage) ) + { + // Serialise output tensor + 
SerialiseOutputTensor(outputTensor, tensor_address_map.at(outputTensor)); + } + } + + return std::move(_raw); +} + +void RawWriter::SerialiseCommandStreamTensor(const Tensor *tensor) +{ + assert(tensor->IsConstant()); + + // command_stream buffer and buffer size + auto blob = tensor->View().Buffer()->Data(); + auto blobSize = tensor->View().Buffer()->Size(); + + const size_t serialisedTensorSize = sizeof(regor_raw_tensor_header_t) + blobSize; + auto serialisedTensor = std::make_unique(serialisedTensorSize); + + // Initialise header + regor_raw_tensor_header_t header; + header.type = regor_raw_tensor_header_t::RAW_TENSOR_TYPE_COMMAND_STREAM; + header.tensor.command_stream.size = blobSize; + + // Copy header + std::copy_n(reinterpret_cast(&header), sizeof(header), serialisedTensor.get()); + + // Copy blob to right after header + std::copy_n(blob, blobSize, serialisedTensor.get() + sizeof(regor_raw_tensor_header_t)); + + _raw.emplace_back(std::move(serialisedTensor), serialisedTensorSize); +} + +void RawWriter::SerialiseReadOnlyTensor(const Tensor *tensor) +{ + assert(tensor->IsConstant()); + + // read_only buffer and buffer size + auto blob = tensor->View().Buffer()->Data(); + auto blobSize = tensor->View().Buffer()->Size(); + + const size_t serialisedTensorSize = sizeof(regor_raw_tensor_header_t) + blobSize; + auto serialisedTensor = std::make_unique(serialisedTensorSize); + + // Initialise read_only header + regor_raw_tensor_header_t header; + header.type = regor_raw_tensor_header_t::RAW_TENSOR_TYPE_READ_ONLY; + header.tensor.read_only.region = WEIGHTS_REGION; + header.tensor.read_only.size = blobSize; + + // Copy header + std::copy_n(reinterpret_cast(&header), sizeof(header), serialisedTensor.get()); + + // Copy blob to right after header + std::copy_n(blob, blobSize, serialisedTensor.get() + sizeof(regor_raw_tensor_header_t)); + + _raw.emplace_back(std::move(serialisedTensor), serialisedTensorSize); +} + +void RawWriter::SerialiseScratchTensor(const Tensor 
*tensor, Address address) +{ + assert(!tensor->IsConstant()); + + const size_t serialisedTensorSize = sizeof(regor_raw_tensor_header_t); + auto serialisedTensor = std::make_unique(serialisedTensorSize); + + // Initialise scratch header + regor_raw_tensor_header_t header; + header.type = regor_raw_tensor_header_t::RAW_TENSOR_TYPE_SCRATCH; + header.tensor.scratch.region = SCRATCH_REGION; + header.tensor.scratch.size = DataTypeStorageSizeBytes(tensor->Type(), tensor->StorageShape().Elements()); + header.tensor.scratch.address = address; + + // Copy header + std::copy_n(reinterpret_cast(&header), sizeof(header), serialisedTensor.get()); + + _raw.emplace_back(std::move(serialisedTensor), serialisedTensorSize); +} + +void RawWriter::SerialiseScratchFastTensor(const Tensor *tensor, Address address) +{ + assert(!tensor->IsConstant()); + + const size_t serialisedTensorSize = sizeof(regor_raw_tensor_header_t); + auto serialisedTensor = std::make_unique(serialisedTensorSize); + + // Initialise scratch_fast tensor header + regor_raw_tensor_header_t header; + header.type = regor_raw_tensor_header_t::RAW_TENSOR_TYPE_SCRATCH_FAST; + header.tensor.scratch_fast.region = SCRATCH_FAST_REGION; + header.tensor.scratch_fast.size = DataTypeStorageSizeBytes(tensor->Type(), tensor->StorageShape().Elements()); + header.tensor.scratch_fast.address = address; + + // Copy header + std::copy_n(reinterpret_cast(&header), sizeof(header), serialisedTensor.get()); + + _raw.emplace_back(std::move(serialisedTensor), serialisedTensorSize); +} + +void RawWriter::SerialiseInputTensor(const Tensor *tensor, Address address) +{ + assert(!tensor->IsConstant()); + + const size_t serialisedTensorSize = sizeof(regor_raw_tensor_header_t); + auto serialisedTensor = std::make_unique(serialisedTensorSize); + + // Initialise input tensor header + regor_raw_tensor_header_t header; + header.type = regor_raw_tensor_header_t::RAW_TENSOR_TYPE_INPUT; + header.tensor.input.region = INPUT_REGION; + 
header.tensor.input.element_size = DataTypeStorageSizeBytes(tensor->Type(), 1); + auto shape = Shape::PadAxes(tensor->StorageShape(), 4, 1).ToList(); + std::copy(shape.begin(), shape.end(), header.tensor.input.shape); + header.tensor.input.size = DataTypeStorageSizeBytes(tensor->Type(), tensor->StorageShape().Elements()); + header.tensor.input.address = address; + + // Copy header + std::copy_n(reinterpret_cast(&header), sizeof(header), serialisedTensor.get()); + + _raw.emplace_back(std::move(serialisedTensor), serialisedTensorSize); +} + +void RawWriter::SerialiseOutputTensor(const Tensor *tensor, Address address) +{ + assert(!tensor->IsConstant()); + + const size_t serialisedTensorSize = sizeof(regor_raw_tensor_header_t); + auto serialisedTensor = std::make_unique(serialisedTensorSize); + + // Initialise output tensor header + regor_raw_tensor_header_t header; + header.type = regor_raw_tensor_header_t::RAW_TENSOR_TYPE_OUTPUT; + header.tensor.output.region = OUTPUT_REGION; + header.tensor.output.element_size = DataTypeStorageSizeBytes(tensor->Type(), 1); + auto shape = Shape::PadAxes(tensor->StorageShape(), 4, 1).ToList(); + std::copy(shape.begin(), shape.end(), header.tensor.input.shape); + header.tensor.output.size = DataTypeStorageSizeBytes(tensor->Type(), tensor->StorageShape().Elements()); + header.tensor.output.address = address; + + // Copy header + std::copy_n(reinterpret_cast(&header), sizeof(header), serialisedTensor.get()); + + _raw.emplace_back(std::move(serialisedTensor), serialisedTensorSize); +} + +} // namespace regor diff --git a/ethosu/regor/compiler/raw_writer.hpp b/ethosu/regor/compiler/raw_writer.hpp new file mode 100644 index 00000000..9630f104 --- /dev/null +++ b/ethosu/regor/compiler/raw_writer.hpp @@ -0,0 +1,53 @@ +// +// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file 
except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "compiler/graph.hpp" +#include "compiler/tensor.hpp" + +#include +#include +#include + +namespace regor +{ + +class RawWriter +{ +public: + std::vector, size_t>> Serialise(const std::vector> &graphs, + const std::vector> &tensor_address_maps); + +private: + std::vector, size_t>> _raw; + + void SerialiseCommandStreamTensor(const Tensor *tensor); + + void SerialiseReadOnlyTensor(const Tensor *tensor); + + void SerialiseScratchTensor(const Tensor *tensor, Address address); + + void SerialiseScratchFastTensor(const Tensor *tensor, Address address); + + void SerialiseInputTensor(const Tensor *tensor, Address address); + + void SerialiseOutputTensor(const Tensor *tensor, Address address); +}; + +} // namespace regor diff --git a/ethosu/regor/compiler/scheduler.cpp b/ethosu/regor/compiler/scheduler.cpp new file mode 100644 index 00000000..31571307 --- /dev/null +++ b/ethosu/regor/compiler/scheduler.cpp @@ -0,0 +1,1742 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "scheduler.hpp" + +#include "common/logging.hpp" + +#include "architecture/weight_encoder.hpp" +#include "cascade_builder.hpp" +#include "common/scaling.hpp" +#include "common/vector_span.hpp" +#include "faststorage_allocator.hpp" +#include "live_range.hpp" +#include "tensor_allocator.hpp" + +#include +#include +#include + +namespace regor +{ + +constexpr int AllocationQuantum = 16; +constexpr int AlignmentQuantum = 16; + +static Shape GetShapeForFormat(const Shape &shape, TensorFormat format) +{ + if ( format == TensorFormat::NHCWB16 ) + { + return shape.With(-1, RoundAway(shape.Depth(), 16)); + } + return shape; +} + +int TensorAllocationBytes(const Shape &shape, TensorFormat format, DataType dtype) +{ + Shape storageShape = GetShapeForFormat(shape, format); + return RoundAway(DataTypeStorageSizeBytes(dtype, storageShape.Elements()), AllocationQuantum); +} + +Scheduler::Scheduler(Architecture *arch, const SchedulerOptions &options, const std::string &name, + std::vector> &ops) : + _ops(ops) +{ + assert(arch != nullptr); + _arch = arch; + _options = options; + _name = name; + _spilling = _arch->StagingMemory() != _arch->FeatureMapMemory(); +} + +std::shared_ptr Scheduler::Process() +{ + Address peakMemoryUsage = CreateSchedulerRepresentation(); + + // Create the Max schedule template + _maxSchedule = CreateInitialSchedule(); + + // TODO: Disabled until fully implemented + // MoveConstantData( _maxSchedule.get() ); + + // Create the optimised Max schedule + UpdateOpMemorySnapshot(_maxSchedule.get()); + auto optMaxSchedule = ProposeScheduleBuffering(_maxSchedule.get(), std::numeric_limits::max()); + UpdateOpMemorySnapshot(optMaxSchedule.get()); + + // Create Min schedule + auto minSchedule = ProposeMinimalSchedule(); + Address initialStagingLimit = _options.optimizationStagingLimit; + if ( _options.optimizationStrategy == 
OptimizationStrategy::Size ) + { + initialStagingLimit = peakMemoryUsage; + } + + // Build cascades from min schedule + std::unordered_map nonLocal; + CascadeBuilder cascadeBuilder(_ops, nonLocal, _spilling); + cascadeBuilder.BuildCascades(minSchedule.get(), _maxSchedule.get(), initialStagingLimit); + UpdateOpMemorySnapshot(minSchedule.get()); + + std::shared_ptr chosenSchedule = minSchedule; + + if ( _options.optimizationStrategy == OptimizationStrategy::Performance ) + { + // Create an optimized schedule + auto optSchedule = OptimizeSchedule(minSchedule.get(), optMaxSchedule); + UpdateOpMemorySnapshot(optSchedule.get()); + chosenSchedule = std::move(optSchedule); + } + + CoalesceWeightBufferTensors(chosenSchedule.get()); + UpdateOpMemorySnapshot(chosenSchedule.get()); + + ApplySchedule(chosenSchedule.get()); + + if ( _spilling ) + { + // Use fast storage for feature maps + FastStorageAllocator allocator; + allocator.AllocateFeatureMaps(_ops, chosenSchedule.get(), _arch->StagingMemory(), _options.optimizationStagingLimit); + } + + UpdateOpMemorySnapshot(chosenSchedule.get()); + + if ( _options.verboseSchedule ) + { + PrintSchedule(chosenSchedule.get()); + } + + AllocateAddresses(chosenSchedule.get()); + + return chosenSchedule; +} + +Point2i Scheduler::GetStripeInputRequirement(const Shape &ofmShape, Kernel *kernel, ArchResampling resampling) +{ + int rounding; + int upscale = _arch->UpscaleAndRounding(resampling, rounding); + int h = RequiredInputSize(ofmShape.Height(), kernel->Stride().y, kernel->DilatedWH().y, upscale, rounding); + int w = RequiredInputSize(ofmShape.Width(), kernel->Stride().x, kernel->DilatedWH().x, upscale, rounding); + return Point2i(w, h); +} + +// Returns true if NHWC format must be used for the given tensor +static bool CheckLinearFormatForConcatSplit(SchedulerTensor *tensor) +{ + for ( const auto &prod : tensor->producers ) + { + // If axis corresponds to C-dimension, NHCWB16 can only be used in the output if all the concat_start's + // 
are a multiple of 16. This as, it is only then the address offset for the ofm, for all operations, + // will be 16 byte aligned. For other values of axis the address offsets will be 16 byte aligned, as they + // are all based on c = 0 and those addresses are always 16 byte aligned due to the NHCWB16 format. + for ( auto &conn : prod->outputs ) + { + if ( conn.tensor.get() == tensor && conn.slice.offset.Size() > 0 && (conn.slice.offset.Depth() & 15) != 0 ) + { + return true; + } + } + } + for ( const auto &cons : tensor->consumers ) + { + // If read offset is not a multiple of 16 in the C-dimension, NHCWB16 need to be avoided in the input. + for ( auto &conn : cons->inputs ) + { + if ( conn.tensor.get() == tensor && conn.slice.offset.Size() > 0 && (conn.slice.offset.Depth() & 15) != 0 ) + { + return true; + } + } + } + return false; +} + + +static int UpdateSchedulerTensor(TensorUsage usage, SchedulerConnection *conn) +{ + auto tensor = conn->tensor.get(); + + // Multiple consumers/producers require the full tensor present + if ( tensor->producers.size() > 1 || tensor->consumers.size() > 1 || (IsOFM(usage) && conn->slice.offset.Size() > 0) || // Concat + tensor->isGraphInput || tensor->isGraphOutput ) + { + conn->requireFullTensor = true; + } + + // Force linear output from Reverse for C dimension because brick output from Reverse has special requirements + if ( IsOFM(usage) && conn->reverse == ReverseType::C ) + { + tensor->needsLinearFormat = true; + } + + for ( auto producer : tensor->producers ) + { + // TODO: Gather doesn't support brick format yet (MLBEDSW-8410) + if ( producer->Type() == OpType::Scatter || producer->Type() == OpType::Gather ) + { + tensor->needsLinearFormat = true; + } + if ( !producer->IsNpuOp() ) + { + tensor->hasCPUWriters = true; + } + } + + for ( auto consumer : tensor->consumers ) + { + if ( !consumer->IsNpuOp() ) + { + tensor->hasCPUReaders = true; + } + // TODO: Gather doesn't support brick format yet (MLBEDSW-8410) + if ( 
consumer->Type() == OpType::Scatter || consumer->Type() == OpType::Gather ) + { + tensor->needsLinearFormat = true; + } + // Int32 ReduceSum requires linear format + if ( consumer->Type() == OpType::ReduceSum && tensor->dataType == DataType::Int32 ) + { + tensor->needsLinearFormat = true; + } + // Check if consumer shape requires linear format + // Brick format can only be used if both shapes have equal W and C + auto ifm0 = consumer->TryIFM(0); + auto ifm1 = consumer->TryIFM(1); + auto ifm2 = consumer->TryIFM(2); + if ( (ifm0 && ifm0->tensor.get() == tensor && + Shape::PadAxes(ifm0->shape, 2, 1).WC() != Shape::PadAxes(conn->shape, 2, 1).WC()) || + (ifm1 && ifm1->tensor.get() == tensor && + Shape::PadAxes(ifm1->shape, 2, 1).WC() != Shape::PadAxes(conn->shape, 2, 1).WC()) || + (ifm2 && ifm2->tensor.get() == tensor && + Shape::PadAxes(ifm2->shape, 2, 1).WC() != Shape::PadAxes(conn->shape, 2, 1).WC()) ) + { + tensor->needsLinearFormat = true; + } + } + + // Initial criteria (may change) + bool cpuTensor = tensor->hasCPUWriters || tensor->hasCPUReaders || tensor->isGraphInput || tensor->isGraphOutput; + conn->requireFullTensor = conn->requireFullTensor || cpuTensor; + tensor->needsLinearFormat = tensor->needsLinearFormat || cpuTensor || CheckLinearFormatForConcatSplit(tensor); + + // Set tensor format to NHCWB16 for output FeatureMaps, if possible + if ( IsOFM(usage) ) + { + if ( !tensor->needsLinearFormat ) + { + tensor->format = TensorFormat::NHCWB16; + } + } + + return tensor->srcTensor->IsConstant() ? 
0 : tensor->AllocationSizeBytes(); +} + + +Address Scheduler::CreateSchedulerRepresentation() +{ + int minMemoryRequired = 0; + + for ( auto const &schedOp : _ops ) + { + int opMemoryRequired = 0; + + for ( auto pos : schedOp->outputs.pairs() ) + { + opMemoryRequired += UpdateSchedulerTensor(pos.first, &pos.second); + } + + for ( auto pos : schedOp->inputs.pairs() ) + { + opMemoryRequired += UpdateSchedulerTensor(pos.first, &pos.second); + } + + minMemoryRequired = std::max(minMemoryRequired, opMemoryRequired); + } + + return minMemoryRequired; +} + + +namespace +{ + + +ArchAccumulatorSource GetArchAccumulatorSource(const AccumulatorControl &ac) +{ + switch ( ac.source ) + { + case AccumulatorSource::Reset: + return ArchAccumulatorSource::Reset; + case AccumulatorSource::Acc: + return ArchAccumulatorSource::Acc; + case AccumulatorSource::Ifm2: + return ArchAccumulatorSource::Ifm2; + default: + return ArchAccumulatorSource::Reset; + } +} + +std::unique_ptr GetOpConfig(Architecture *arch, SchedulerOperation *op, const Shape &ifmShape, + const Shape &ifm2Shape, const Shape &ofmShape, WeightFormat wgtFormat) +{ + assert(op->IsNpuOp()); + + SchedulerConnection *ifm = op->IFM(0); + SchedulerConnection *ifm2 = op->TryIFM(1); + SchedulerConnection *ofm = op->OFM(); + + ArchitectureConfigQuery query; + query.ofmShape = Shape::PadAxes(ofmShape, 3, 1); + query.ifmShape[0] = ifmShape; + query.ifmShape[1] = ifm2Shape; + query.ifmBits = DataTypeSizeBits(ifm->tensor->dataType); + query.kernel = op->Kernel(); + query.lutBytes = op->TryInput(TensorUsage::LUT) ? 
2048 : 0; + query.scaled = op->HasScaling(); + query.ifmResampling = ifm->resamplingMode; + query.ofmShape = query.ofmShape.Untranspose(ofm->transpose); + query.transpose = ofm->transpose; + query.reverse = ofm->reverse; + query.ofmFormat = ofm->tensor->format; + const auto &accMode = op->AccumulatorMode(); + query.accSource = GetArchAccumulatorSource(accMode); + query.accOutputEnabled = accMode.outputEnabled; + query.weightFormat = wgtFormat; + if ( op->Type() == OpType::Resize ) + { + query.rescaling.scaleX = op->Attributes().resize.scaleX; + query.rescaling.scaleY = op->Attributes().resize.scaleY; + } + + return arch->GetOpConfig(op->Type(), query); +} + +struct EncodingResult +{ + Flags format; + int size = std::numeric_limits::max(); + int optimalDepth = 0; +}; + +Flags ChooseBestWeightFormat(Architecture *arch, SchedulerOperation *, OptimizationStrategy strategy, + int maxStreams, std::vector &encodingResults) +{ + EncodingResult *bestResult = nullptr; + // Prefer fast decoder if weights are in fast memory. 
+ bool preferFast = (arch->ReadonlyMemory().memory->Bandwidth() == arch->StagingMemory().memory->Bandwidth()) && (strategy != OptimizationStrategy::Size); + // Standard decoder is faster if we have 4 instances + bool avoidFast = maxStreams > 2; + // If buffering is improved, we may want to allow larger diff here + constexpr float maxFastSizeRatio{1.01f}; + constexpr float maxSparseSizeRatio{1.01f}; + for ( auto &encodingResult : encodingResults ) + { + bool isFast = (encodingResult.format & WeightFormat::Fast); + if ( !bestResult ) + { + bestResult = &encodingResult; + continue; + } + if ( (bestResult->format & WeightFormat::Fast) != isFast ) + { + if ( isFast && !avoidFast && (preferFast || encodingResult.size <= bestResult->size * maxFastSizeRatio) ) + { + bestResult = &encodingResult; + continue; + } + if ( !isFast && (avoidFast || (!preferFast && encodingResult.size <= bestResult->size * maxFastSizeRatio)) ) + { + bestResult = &encodingResult; + continue; + } + } + if ( !(bestResult->format & WeightFormat::Sparse2_4) && (encodingResult.format & WeightFormat::Sparse2_4) ) + { + if ( (encodingResult.optimalDepth <= bestResult->optimalDepth) && // Sparsity can give different block + // config + (encodingResult.size <= bestResult->size * maxSparseSizeRatio) ) + { + bestResult = &encodingResult; + } + } + } + return bestResult->format; +} + +} // namespace + +Flags Scheduler::BestWeightFormat( + SchedulerOperation *op, Shape &ifmShape, Shape &ifm2Shape, Shape &ofmShape, Flags supportedFormats) +{ + using WF = Flags; + + std::vector encodingResults; + auto weights = op->Input(TensorUsage::Weights); + auto ifm = op->IFM(op->PrimaryIfmIndex()); + auto ifmType = ifm->tensor->dataType; + int maxStreams = 0; + // WF(WeightFormat::Default) needs to be first, FWD doesn't count zeroes. 
+ for ( auto weightFormat : {WF(WeightFormat::Default), WF(WeightFormat::Fast), + WF(WeightFormat::Default, WeightFormat::Sparse2_4), WF(WeightFormat::Fast, WeightFormat::Sparse2_4)} ) + { + if ( (weightFormat & supportedFormats) != weightFormat ) continue; + EncodingResult result{weightFormat}; + std::unique_ptr blockConfig = GetOpConfig(_arch, op, ifmShape, ifm2Shape, ofmShape, weightFormat); + result.optimalDepth = blockConfig->OptimalDepthGranule(); + WeightsRef weightsRef = {&weights->tensor->bufferView, weights->tensor->srcTensor->AxisOrder(), weights->tensor->dataType}; + + std::vector depthOffsets{0, ofmShape.Depth()}; + auto encodingParams = _arch->WeightEncoder()->GetEncodingConfig( + blockConfig.get(), weightsRef, op->Kernel(), ifmType, depthOffsets, weightFormat); + try + { + auto weightInfo = AnalyzeWeights(encodingParams.get(), weights->tensor.get(), weights->quantization); + + result.size = weightInfo.encodedSize; + if ( weightInfo.streams > maxStreams ) maxStreams = weightInfo.streams; + if ( weightFormat == WF(WeightFormat::Default) && weightInfo.zeroCount < DivRoundUp(weightInfo.sourceSize, 2) ) + { + // Can't be Sparse2_4 if less than half of the weights are zero + supportedFormats &= ~WF(WeightFormat::Sparse2_4); + } + } + catch ( const WeightEncodeException & ) + { + continue; + } + encodingResults.emplace_back(result); + } + assert(!encodingResults.empty()); + return ChooseBestWeightFormat(_arch, op, _options.optimizationStrategy, maxStreams, encodingResults); +} + +std::unique_ptr Scheduler::CreateSchedulerOpInfo(SchedulerOperation *op, const Shape &ofmStripeShape) +{ + assert(op->PrimaryIfmIndex() >= 0 && op->PrimaryIfmIndex() <= 1); + SchedulerConnection *ifm = op->IFM(op->PrimaryIfmIndex()); + SchedulerConnection *ifm2 = op->TryIFM(1 - op->PrimaryIfmIndex()); + + auto ifmShape = ifm->shape; + auto ifm2Shape = ifm2 ? 
ifm2->shape : Shape(); + auto ofmShape = ofmStripeShape; + + // Operations that cannot be subdivided require full OFM shape + if ( _arch->CanSubdivide(op->Type()) == AxisMask::None ) + { + ofmShape = op->OFM()->shape; + } + + // Give empty operation info to CPU ops + if ( !op->IsNpuOp() ) + { + return std::make_unique(nullptr, ifmShape, ifm2Shape, ofmShape); + } + + // Determine if striped operation + if ( ofmShape != op->OFM()->shape ) + { + // Striped Op - Need to calculate stripe input volume + Point2i stripeInput = GetStripeInputRequirement(ofmShape, op->Kernel(), ifm->resamplingMode); + + // Ensure stripe input volume is within the full IFM volume + stripeInput = Point2i::Min(stripeInput, ifmShape.WH()); + ifmShape = ifmShape.WithHW(stripeInput.y, stripeInput.x); + + if ( !ifm2Shape.IsEmpty() ) + { + Point2i stripeInput2 = Point2i::Min(stripeInput, ifm2Shape.WH()); + ifm2Shape = ifm2Shape.WithHW(stripeInput2.y, stripeInput2.x); + } + } + + auto weightFormat = _arch->SupportedWeightFormat(op->Type()); + + WeightScaleTensors weightScales; + auto weights = op->TryInput(TensorUsage::Weights); + if ( !weights || !weights->tensor->IsConstant() ) weightFormat = WeightFormat::Default; + if ( weightFormat != WeightFormat::Default ) + weightFormat = BestWeightFormat(op, ifmShape, ifm2Shape, ofmShape, weightFormat); + // Potentially repeat until weight encoding successful + std::unique_ptr blockConfig; + do + { + blockConfig = GetOpConfig(_arch, op, ifmShape, ifm2Shape, ofmShape, weightFormat); + + if ( weights == nullptr || !weights->tensor->IsConstant() ) break; + + auto scales = op->Input(TensorUsage::Scales); + + WeightsRef weightsRef = {&weights->tensor->bufferView, weights->tensor->srcTensor->AxisOrder(), weights->tensor->dataType}; + + std::vector depthOffsets{0, ofmShape.Depth()}; + + auto encodingParams = _arch->WeightEncoder()->GetEncodingConfig( + blockConfig.get(), weightsRef, op->Kernel(), ifm->tensor->dataType, depthOffsets, weightFormat); + + try + { + if 
( op->OFM()->quantization.type != QuantizationType::EXPLICIT ) + { + auto temp = _arch->WeightEncoder()->MakeExplicit(ifm->quantization, weights->quantization, + op->OFM()->quantization, scales->tensor->dataType, ifm->tensor->dataType); + op->OFM()->quantization = std::move(temp); + assert(op->OFM()->quantization.type == QuantizationType::EXPLICIT); + } + + weightScales = EncodeWeightAndScaleTensor(std::move(encodingParams), weights->tensor.get(), + scales->tensor.get(), weights->quantization, op->OFM()->quantization); + break; + } + catch ( const WeightEncodeException & ) + { + if ( weightFormat & WeightFormat::Sparse2_4 ) weightFormat ^= WeightFormat::Sparse2_4; + else if ( weightFormat & WeightFormat::Fast ) weightFormat ^= WeightFormat::Fast; + else break; + } + } while ( true ); + + // Finally construct and populate operator information (cost) + auto opInfo = std::make_unique(std::move(blockConfig), ifmShape, ifm2Shape, ofmShape); + opInfo->SetWeightScaleTensors(weightScales.npuWeightsTensor, weightScales.npuScalesTensor); + + return opInfo; +} + + +std::unique_ptr Scheduler::CreateInitialSchedule() +{ + auto schedule = std::make_unique(_name + "_MAX"); + + for ( auto &op : _ops ) + { + auto cost = CreateSchedulerOpInfo(op.get(), op->OFM()->SliceShape()); + cost->cycles = EstimateOpPerformance(op.get(), cost->Config(), op->OFM()->shape.Depth()); + cost->elementAccess = EstimateOpElementAccess(op.get(), cost->Config(), op->OFM()->shape.Depth()); + schedule->SetCost(*op, std::move(cost)); + } + return schedule; +} + + +void Scheduler::MoveConstantData(Schedule *refSchedule) +{ + auto permanentStorageMemory = _arch->ReadonlyMemory(); + const bool moveConstantData = permanentStorageMemory != _arch->FeatureMapMemory(); + + // Determine if data can be moved from permanent storage to another memory area. A difference in source tensor + // and target tensor memory area will generate a DMA command in the command stream. 
+ for ( auto &schedOp : _ops ) + { + // Ignore CPU ops + if ( !schedOp->IsNpuOp() ) + { + continue; + } + + auto cost = refSchedule->Cost(schedOp.get()); + int maxIfmShramAvail = cost->Config()->MaxIFMBuffering() / 2; + for ( auto pos : schedOp->inputs.pairs() ) + { + SchedulerConnection *conn = &pos.second; + if ( !conn->tensor->srcTensor->IsConstant() ) + { + continue; + } + + // Determine whether or not to move data from permanent storage to more suitable + // storage before use. + bool moveData = false; + if ( conn->tensor->memArea == permanentStorageMemory && moveConstantData ) + { + moveData = std::any_of(conn->tensor->consumers.begin(), conn->tensor->consumers.end(), + [](const SchedulerOperation *op) { return op->Type() != OpType::FullyConnected; }); + + // Check if broadcast elementwise can be buffered + if ( IsIFM(pos.first) && IsElementwise(schedOp->Type()) && (conn->shape != schedOp->OFM()->shape) && + conn->tensor->srcTensor->View().BufferSize() > maxIfmShramAvail ) + { + moveData = true; + } + } + + if ( moveData ) + { + // Set scheduler tensor to different memory area i.e. 
move from srcTensor to (scheduler) tensor + conn->tensor->memArea = _arch->StagingMemory(); + } + } + } +} + + +void Scheduler::AllocateAddresses(Schedule *schedule) +{ + const auto verbose = _options.verboseAllocation; + AllocateTensors(_ops, schedule, _arch->FeatureMapMemory(), TensorAllocator::HillClimb, AlignmentQuantum, verbose); + if ( _spilling ) + { + const auto limit = _options.optimizationStagingLimit; + AllocateTensors(_ops, schedule, _arch->StagingMemory(), TensorAllocator::HillClimb, AlignmentQuantum, verbose, limit); + } +} + + +/// @brief Specialised LiveRangeGraph for read only (flash) memory which ignores scalars if possible +class ReadOnlyLiveRangeGraph : public LiveRangeGraph +{ +private: + Architecture *_arch; + +public: + ReadOnlyLiveRangeGraph(Architecture *arch) : _arch(arch) {} + bool ShouldBeIgnored(SchedulerTensor *tens, const MemArea &targetMemory) override + { + // First do the regular check for matching memory type + if ( LiveRangeGraph::ShouldBeIgnored(tens, targetMemory) ) + { + return true; + } + // Memory type correct, check if this tensor is a scalar that can be encoded + // in the command stream for this architecture + auto srcTens = tens->srcTensor; + if ( srcTens && srcTens->StorageShape().Elements() == 1 && srcTens->IsConstant() ) + { + // All consumers must accept this scalar if we are to ignore it + for ( auto op : tens->consumers ) + { + // Find usage of the tensor for this consumer op + TensorUsage usage(TensorUsage::None); + for ( auto input : op->inputs.pairs() ) + { + if ( input.second.tensor.get() == tens ) + { + usage = input.first; + } + } + if ( !_arch->SupportsScalar(op->Type(), tens->dataType, usage) ) + { // This scalar cannot be ignored and must be handled + return false; + } + } + // At this point we have determined that the tensor can be encoded + // as a scalar in the command stream and can safely be ignored + return true; + } + // Not a scalar - cannot be ignored + return false; + } +}; + + +void 
Scheduler::AllocateReadOnlyAddresses(Schedule *schedule, IncrementalLinearAllocator &readOnlyAllocator) +{ + auto lrGraph = ReadOnlyLiveRangeGraph(_arch); + lrGraph.ExtractLiveRangesFromCascades(_ops, schedule, _arch->ReadonlyMemory(), false); + auto totalSize = readOnlyAllocator.Allocate(&lrGraph, AlignmentQuantum, _options.verboseAllocation); + schedule->memoryUsage[_arch->ReadonlyMemory()] = int(totalSize); +} + + +void Scheduler::UpdateOpMemorySnapshot(Schedule *schedule) +{ + const auto fastStorage = _arch->StagingMemory(); + auto lrGraph = LiveRangeGraph(); + lrGraph.ExtractLiveRangesFromCascades(_ops, schedule, fastStorage, true); + // Populate time-array with memory used by live ranges + std::vector temporalUsage = lrGraph.GetTemporalMemoryUsage(schedule->fastStoragePeakUsage); + schedule->memorySnapshot = std::move(temporalUsage); +} + + +std::shared_ptr Scheduler::ProposeScheduleBuffering(Schedule *refSchedule, Address stagingLimitBytes) +{ + auto bufferedSchedule = std::make_shared(refSchedule->Name() + "_BUFFERED"); + int stagingLimitClamped = int(std::min(INT64_C(1) << 30, stagingLimitBytes)); + + SchedulerOperation *prevOp = nullptr; + for ( auto const &schedOp : _ops ) + { + SchedulerOpInfo *cost = refSchedule->Cost(schedOp.get()); + // schedOp is not part of this sub-schedule - skip + if ( cost == nullptr ) + { + continue; + } + + ProposeOperatorBuffering(schedOp.get(), prevOp, bufferedSchedule.get(), refSchedule, stagingLimitClamped); + prevOp = schedOp.get(); + } + + return bufferedSchedule; +} + + +void Scheduler::ProposeOperatorBuffering(SchedulerOperation *schedOp, SchedulerOperation *prevOp, + Schedule *bufferedSchedule, Schedule *refSchedule, int stagingLimitBytes) +{ + // Mild recursion might mean this Op has already been seen + if ( bufferedSchedule->Cost(schedOp) != nullptr ) + { + return; + } + + // Take the reference schedule as default costings for this schedule + auto refCost = refSchedule->Cost(schedOp); + assert(refCost != nullptr); 
+ auto costCopy = std::make_unique(*refCost); + auto cost = costCopy.get(); + bufferedSchedule->SetCost(*schedOp, std::move(costCopy)); + + // Don't buffer non NPU operations + if ( !schedOp->IsNpuOp() ) + { + return; + } + + int slackBufferingMemory = stagingLimitBytes - refSchedule->MemoryUsageAt(refCost->timeIndex); + cost->slackBufferingMemory = slackBufferingMemory; + cost->slackBufferingCycles = refCost->cycles.opCycles; + + // Attempt weight buffering on anything with a weights tensor + auto weights = schedOp->TryInput(TensorUsage::Weights); + if ( weights != nullptr ) + { + auto scales = schedOp->Input(TensorUsage::Scales); + ProposeWeightBuffering(weights, scales, schedOp, prevOp, bufferedSchedule, refSchedule, slackBufferingMemory); + } +} + + +void Scheduler::ProposeWeightBuffering(SchedulerConnection *weights, SchedulerConnection *scales, SchedulerOperation *schedOp, + SchedulerOperation *prevOp, Schedule *bufferedSchedule, Schedule *refSchedule, int bufferLimitBytes) +{ + constexpr int OFMSplitDepth = 16; + auto cost = bufferedSchedule->Cost(schedOp); + auto prevCost = bufferedSchedule->Cost(prevOp); + auto refCost = refSchedule->Cost(schedOp); + auto ifm = schedOp->IFM(0); + auto ofm = schedOp->OFM(); + + assert(cost && refCost); + + // Weights are in permanent storage. 
When permanent storage differs from feature map storage, + // there is a point moving the data + auto weightTens = weights->tensor.get(); + auto scaleTens = scales->tensor.get(); + // No need to move the weights if they are already in the same memory as the staging area + bool needsDMA = weightTens->memArea.memory != _arch->StagingMemory().memory; + + std::vector ofmFullDepthSlices = {0, refCost->stripe.Depth()}; + + WeightsRef weightsRef = {&weightTens->bufferView, weightTens->srcTensor->AxisOrder(), weightTens->dataType}; + + auto weightFormat = cost->npuWeightsTensor->config->Format(); + + auto encodingParams = _arch->WeightEncoder()->GetEncodingConfig( + cost->Config(), weightsRef, schedOp->Kernel(), ifm->tensor->dataType, ofmFullDepthSlices, weightFormat); + + auto fullWeightScales = EncodeWeightAndScaleTensor( + std::move(encodingParams), weightTens, scaleTens, weights->quantization, ofm->quantization); + + int fullWeightsBytes = fullWeightScales.npuWeightsTensor->AllocationSizeBytes(); + + // No buffering required - take all the weights from permanent storage + if ( schedOp->Type() == OpType::FullyConnected || !needsDMA || + _arch->CanSubdivide(schedOp->Type()) == AxisMask::None || schedOp->OFM()->reverse == ReverseType::C ) + { + cost->ofmDepthSlices = std::move(ofmFullDepthSlices); + cost->SetWeightScaleTensors(fullWeightScales.npuWeightsTensor, fullWeightScales.npuScalesTensor); + return; + } + + auto encodedWeightScales = fullWeightScales; + + // How many NPU cycles are available under the previously executing + // operator for performing buffered DMA transfers + int64_t slackCycles = (prevCost != nullptr) ? prevCost->slackBufferingCycles : 0; + int slackMemory = (prevCost != nullptr) ? 
prevCost->slackBufferingMemory : 0; + + int weightBufferSize = 0; + + // Force full depth for cascaded Ops + if ( cost->cascade != 0 ) + { + weightBufferSize = fullWeightsBytes; + // Update the memory snapshot to reflect the added size of the weights + refSchedule->memorySnapshot[cost->timeIndex] += weightBufferSize; + } + else + { + // Estimate the buffering cycle time for the full set of weights + int64_t fullTransferCycles = _arch->Performance()->MemToMemCycles( + _arch->StagingMemory().memory, weightTens->memArea.memory, fullWeightsBytes); + cost->fullWeightTransferCycles = fullTransferCycles; + + // Calculate the amount of pre-buffering necessary (or what is possible with limited + // double buffer buffer size) + double prebufferRatio = 0; + const int halfBufferLimit = bufferLimitBytes / 2; + int prebufferBytes = std::min(fullWeightsBytes, halfBufferLimit); + if ( fullTransferCycles > slackCycles ) + { + prebufferRatio = double(slackCycles) / double(fullTransferCycles); + prebufferBytes = std::min(int(prebufferRatio * fullWeightsBytes), halfBufferLimit); + } + + prebufferRatio = double(prebufferBytes) / fullWeightsBytes; + + // Have to split the weights if the initial buffering can't store + // all of the compressed weights + if ( prebufferBytes < fullWeightsBytes ) + { + int blockDepth = cost->Config()->OptimalDepthGranule(); + + // Choose initial pre-buffering depth (already buffer clamped) + int prebufferDepth = int(refCost->stripe.Depth() * prebufferRatio); + prebufferDepth = int(std::max(16, RoundZero(prebufferDepth, OFMSplitDepth))); + + // Calculate cycles executed during the pre-buffer + auto preOpCycles = EstimateOpPerformance(schedOp, cost->Config(), prebufferDepth); + int bufferingDepth = int((refCost->stripe.Depth() * preOpCycles.opCycles) / fullTransferCycles); + + // Choose initial buffering depth and clamp to the double buffering limit + bufferingDepth = RoundAway(bufferingDepth, blockDepth); + int bufferingBytes = (bufferingDepth / 
refCost->stripe.Depth()) * fullWeightsBytes; + if ( bufferingBytes > halfBufferLimit ) + { + bufferingDepth = (halfBufferLimit / fullWeightsBytes) * refCost->stripe.Depth(); + } + + while ( true ) + { + // Attempt to buffer whole blocks + if ( bufferingDepth > blockDepth ) + { + bufferingDepth = RoundZero(bufferingDepth, blockDepth); + } + else + { + bufferingDepth = RoundZero(bufferingDepth, OFMSplitDepth); + } + + bufferingDepth = int(std::max(bufferingDepth, OFMSplitDepth)); + + // Create list of depth slices + std::vector depthSlices = {0}; + + for ( int depth = prebufferDepth; depth < refCost->stripe.Depth(); depth += bufferingDepth ) + { + depthSlices.push_back(depth); + } + depthSlices.push_back(refCost->stripe.Depth()); + + // Encode weights based depth slices + cost->ofmDepthSlices = std::move(depthSlices); + + weightsRef = {&weightTens->bufferView, weightTens->srcTensor->AxisOrder(), weightTens->dataType, false}; + + encodingParams = _arch->WeightEncoder()->GetEncodingConfig(cost->Config(), weightsRef, + schedOp->Kernel(), ifm->tensor->dataType, cost->ofmDepthSlices, weightFormat); + + encodedWeightScales = EncodeWeightAndScaleTensor( + std::move(encodingParams), weightTens, scaleTens, weights->quantization, ofm->quantization); + + // Chosen buffering might not fit at all, iterate until it does + // or until the minimum usable slice size is reached + if ( encodedWeightScales.npuWeightsTensor->maxRangeBytes <= halfBufferLimit || prebufferDepth == OFMSplitDepth ) + { + break; + } + + // Failed to choose buffer sizes above, reduce them and try again + if ( bufferingDepth > prebufferDepth ) + { + bufferingDepth = RoundAway(bufferingDepth / 2, OFMSplitDepth); + } + else + { + prebufferDepth = RoundAway(prebufferDepth / 2, OFMSplitDepth); + } + } + + // Calculate cycles required to run the last op for use as future slack + int lastDepth = cost->ofmDepthSlices.back(); + lastDepth -= *(cost->ofmDepthSlices.rbegin() + 1); + auto tailCycles = 
EstimateOpPerformance(schedOp, cost->Config(), lastDepth); + cost->slackBufferingCycles = tailCycles.opCycles; + } + } + + // Determine whether the weights need to be double buffered + int encodedWeightsSize = encodedWeightScales.npuWeightsTensor->AllocationSizeBytes(); + weightBufferSize = std::min(encodedWeightsSize, encodedWeightScales.npuWeightsTensor->maxRangeBytes); + + // Only buffer weights if there's still space left for the buffer + if ( weightBufferSize <= bufferLimitBytes ) + { + assert(weightBufferSize % 16 == 0); // NOTE: vague check, leave validation until later? + + // Determine whether to double buffer or single buffer + Buffering buffering = Buffering::Single; + if ( (weightBufferSize * 2 <= bufferLimitBytes) && (weightBufferSize < encodedWeightsSize) ) + { + weightBufferSize = weightBufferSize * 2; + buffering = Buffering::Double; + } + + // Create a new tensor in fast storage to use as weights buffer + cost->bufferedWeightTensor.tensor = std::make_shared(); + cost->bufferedWeightTensor.tensor->srcTensor = encodedWeightScales.npuWeightsTensor->srcTensor; + cost->bufferedWeightTensor.tensor->allocatedSize = weightBufferSize; + cost->bufferedWeightTensor.tensor->memArea = _arch->StagingMemory(); + cost->bufferedWeightTensor.buffering = buffering; + + if ( cost->cascade == 0 ) + { + // Determine if the lifetime can be extended and pre-buffer weights under the previous operation + cost->bufferedWeightTensor.preBuffer = (weightBufferSize < slackMemory); + } + + cost->slackBufferingMemory -= weightBufferSize; + } + else + { + // Don't slice or buffer - use the whole depth from persistent storage + cost->ofmDepthSlices = std::move(ofmFullDepthSlices); + encodedWeightScales = std::move(fullWeightScales); + } + cost->SetWeightScaleTensors(encodedWeightScales.npuWeightsTensor, encodedWeightScales.npuScalesTensor); +} + + +std::shared_ptr Scheduler::ProposeMinimalSchedule() +{ + // Proposes scheduling parameters where every operator is subdivided into the 
smallest stripe that + // satisfies the next operators stride + auto minSchedule = std::make_shared(_name + "_MIN"); + + // Keep track of the previous Op - which consumes the current Op's OFM + SchedulerOperation *prevOp = nullptr; + + // Work backwards up the schedule setting the minimum stripe height + for ( auto pos = _ops.rbegin(); pos != _ops.rend(); pos++ ) + { + auto const &schedOp = *pos; + int minStripeHeight = (prevOp != nullptr) ? prevOp->Kernel()->Stride().y : 1; + Shape minStripe = Shape::PadAxes(schedOp->OFM()->shape, 3, 1); + minStripe[-3] = minStripeHeight; + auto cost = CreateSchedulerOpInfo(schedOp.get(), minStripe); + cost->cycles = EstimateOpPerformance(schedOp.get(), cost->Config(), schedOp->OFM()->shape.Depth()); + cost->elementAccess = EstimateOpElementAccess(schedOp.get(), cost->Config(), schedOp->OFM()->shape.Depth()); + minSchedule->SetCost(*schedOp, std::move(cost)); + + prevOp = schedOp.get(); + } + + return minSchedule; +} + + +std::shared_ptr Scheduler::OptimizeSchedule(Schedule *schedule, const std::shared_ptr &maxSchedule) +{ + // Extracts sub-schedules based on the cascades and optimizes them and applies them to the final schedule + if ( maxSchedule->fastStoragePeakUsage < _options.optimizationStagingLimit && !_spilling ) + { + return maxSchedule; + } + + // Optimize cascades separately + // Iterate over a copy of the cascades since they may change during the loop + auto cascades = schedule->cascades; + for ( const auto &pos : cascades ) + { + const CascadeInfo &cascadeInfo = pos.second; + + auto optSubSchedule = OptimizeSubSchedule(cascadeInfo, schedule, _options.optimizationStagingLimit); + if ( optSubSchedule != nullptr ) + { + // Remove the existing cascade + schedule->cascades.erase(pos.first); + // Move subschedule costs/cascades back into the schedule + SchedulerCostMap costs; + optSubSchedule->DetachCosts(costs); + schedule->UpdateCosts(costs); + schedule->UpdateCascades(optSubSchedule->cascades); + } + } + + // Update 
memory snapshot + UpdateOpMemorySnapshot(schedule); + + // Propose schedule buffering to the optimized schedule + auto optSchedule = ProposeScheduleBuffering(schedule, _options.optimizationStagingLimit); + optSchedule->cascades = std::move(schedule->cascades); // TODO: Check this is okay + // Copy the cascade's metadata from the unbuffered schedule + return optSchedule; +} + + +std::shared_ptr Scheduler::ProposeScheduleStriping(const Shape &finalStripe, const std::string &label, Schedule *refSchedule) +{ + // Proposes new striping for a schedule. The stripe is derived from the ifm requirements of the next Op down + auto stripedSchedule = std::make_shared(label); + + Shape stripe = finalStripe; + for ( auto pos = _ops.rbegin(); pos != _ops.rend(); pos++ ) + { + auto schedOp = pos->get(); + auto refCost = refSchedule->Cost(schedOp); + if ( !schedOp->IsNpuOp() || refCost == nullptr ) + { + // sched_op is not part of the sub-schedule - skip + continue; + } + + // Create a cost entry with the new stripe + auto cost = CreateSchedulerOpInfo(schedOp, stripe); + + // Estimate performance + cost->cycles = EstimateOpPerformance(schedOp, cost->Config(), schedOp->OFM()->shape.Depth()); + cost->elementAccess = EstimateOpElementAccess(schedOp, cost->Config(), schedOp->OFM()->shape.Depth()); + stripedSchedule->SetCost(*schedOp, std::move(cost)); + + // Calculate the preceding Op's stripe + stripe = schedOp->IFM(schedOp->PrimaryIfmIndex())->shape.With(-3, stripe.Height() * schedOp->Kernel()->Stride().y); + } + return stripedSchedule; +} + + +Address Scheduler::EstimateScheduleMemoryUsage(Schedule *schedule, const std::unordered_map &nonLocalMem) +{ + // Estimates the memory usage of a schedule + // cascades = schedule.cascades; + int peakMemUsage = 0; + for ( auto const &schedOp : _ops ) + { + auto cost = schedule->Cost(schedOp.get()); + if ( cost == nullptr ) + { + // sched_op is not part of the sub-schedule - skip + continue; + } + + if ( cost->cascade != 0 ) + { + // This Op is 
part of a cascade - use the cascade's memory usage + auto const &cascadeInfo = schedule->cascades.at(cost->cascade); + // Non-local memory usage is already included in the cascade_info + peakMemUsage = std::max(cascadeInfo.memUsage, peakMemUsage); + } + else + { + // This Op is not part of a cascade - calculate the memory usage + int opWeightBuffer = 0; + if ( cost->bufferedWeightTensor.tensor ) + { + opWeightBuffer = cost->bufferedWeightTensor.tensor->AllocationSizeBytes(); + } + + int opMemUsage = schedOp->IFM(0)->PartialAllocationSizeBytes() + schedOp->OFM()->PartialAllocationSizeBytes() + opWeightBuffer; + if ( nonLocalMem.find(*schedOp) != nonLocalMem.end() ) + { + opMemUsage += nonLocalMem.at(*schedOp); + } + + auto ifm1 = schedOp->TryIFM(1); + if ( ifm1 ) + { + opMemUsage += ifm1->PartialAllocationSizeBytes(); + } + + peakMemUsage = std::max(opMemUsage, peakMemUsage); + } + } + return peakMemUsage; +} + + +std::shared_ptr Scheduler::OptimizeSubSchedule(const CascadeInfo &cascadeInfo, Schedule *refSchedule, Address stagingLimitBytes) +{ + // Extracts the Ops covered by the given cascade and creates a sub-schedule. 
The sub-schedule is optimized by + // proposing weight buffering and then continuously proposing new stripe sizes + + // Extract the ops that are part of this sub-schedule + vector_span> subOps(_ops, cascadeInfo.start, cascadeInfo.end + 1); + + // Create a sub-schedule that contains only the costs for the Ops that are part of the sub-schedule + auto subSchedule = std::make_shared(_name + fmt::format("SUB_{}_{}", cascadeInfo.start, cascadeInfo.end)); + for ( auto &subOp : subOps ) + { + // NOTE: Copies the cost objects, consider optimising this + auto costCopy = std::make_unique(*refSchedule->Cost(subOp.get())); + subSchedule->SetCost(*subOp, std::move(costCopy)); + } + + // Update subschedule cascade list + subSchedule->cascades[cascadeInfo.end] = cascadeInfo; + + // Use the memory snapshot from the reference schedule (takes a copy) + subSchedule->memorySnapshot = refSchedule->memorySnapshot; + + SchedulerOperation *firstOp = subOps.front().get(); + + // Calculate memory usage that is live during the sub-schedule but not part of it + int timeForCascade = refSchedule->Cost(firstOp)->timeIndex; + + int memUsageParallelToSubSchedule = refSchedule->MemoryUsageAt(timeForCascade) - cascadeInfo.memUsage; + + // If the first Op's IFM has other consumers it has to live throughout the whole sub-schedule whether it's + // included in a cascade or not + int persistentInitialIFM = 0; + auto firstOpIfm = firstOp->IFM(firstOp->PrimaryIfmIndex()); + if ( firstOpIfm->tensor->consumers.size() > 1 ) + { + persistentInitialIFM = firstOpIfm->tensor->AllocationSizeBytes(); + } + + // Calculate non-local-mem-usage per Operator + std::unordered_map nonLocalMemUsage; + nonLocalMemUsage[*firstOp] = memUsageParallelToSubSchedule; + for ( int i = 1; i < subOps.size(); i++ ) + { + nonLocalMemUsage[*subOps[i]] = memUsageParallelToSubSchedule + persistentInitialIFM; + } + + CascadeBuilder cascadeBuilder(subOps, nonLocalMemUsage, _spilling); + + // Start by adding buffering + auto 
bufferedSubSchedule = ProposeScheduleBuffering(subSchedule.get(), _options.optimizationStagingLimit); + + // Copy the cascades over from the unbuffered-schedule + bufferedSubSchedule->cascades = subSchedule->cascades; + + // Generate the possible stripings for the final Op in the sub-schedule + Shape finalOFMShape = subOps.back()->OFM()->shape; + const int maxStripeHeight = (finalOFMShape.Height() + 1) / 2; + + std::vector possibleStripes; + possibleStripes.reserve(maxStripeHeight); + for ( int h = 1; h <= maxStripeHeight; h++ ) + { + possibleStripes.push_back(finalOFMShape.With(-3, h)); + } + + // Propose different striping - the possible stripes are proposed similarly to a binary search + std::shared_ptr bestSchedule; + int first = 0, last = int(possibleStripes.size()); + +#if LOG_TRACE1_ON + LOG_INDENT(Logging::Out); +#endif + + while ( first < last ) + { + int index = first + (last - first) / 2; + const Shape &proposedStripe = possibleStripes[index]; + + auto proposedSchedule = ProposeScheduleStriping( + proposedStripe, fmt::format("_OPT_{}", proposedStripe.Height()), bufferedSubSchedule.get()); + + cascadeBuilder.BuildCascades(proposedSchedule.get(), _maxSchedule.get(), stagingLimitBytes); + + // Check if proposal fits + Address proposedMemUsage = EstimateScheduleMemoryUsage(proposedSchedule.get(), nonLocalMemUsage); + + if ( proposedMemUsage <= stagingLimitBytes ) + { + // Ignore all possible stripes smaller than this + first = index + 1; + bestSchedule = proposedSchedule; + // No cascading required - early exit + if ( proposedSchedule->cascades.empty() ) + { + break; + } + } + else + { + // Proposal doesn't fit within the limit - ignore all possible stripes larger than this + last = index; + } + } + + return bestSchedule; +} + + +void Scheduler::ApplySchedule(Schedule *schedule) +{ + const auto idealFormat = _arch->IdealBufferingFormat(); + + // Applies the given schedule as the end result + for ( auto &schedOp : _ops ) + { + if ( !schedOp->IsNpuOp() ) + { + 
continue; + } + + auto cost = schedule->Cost(schedOp.get()); + if ( cost->cascade > 0 ) + { + const CascadeInfo &cascadeInfo = schedule->cascades.at(cost->cascade); + auto pos = cascadeInfo.buffers.find(*schedOp); + if ( pos != cascadeInfo.buffers.end() ) + { + auto bufferTensor = schedOp->IFM(schedOp->PrimaryIfmIndex())->tensor.get(); + // Apply memory area + bufferTensor->memArea = _arch->StagingMemory(); + // Apply rolling buffer dimensions + Shape bufferShape = pos->second.shape; + assert(!bufferTensor->needsLinearFormat); + bufferTensor->format = idealFormat; + assert(bufferShape.Width() == bufferTensor->storageShape.Width() && "Only y-striping implemented"); + bufferTensor->storageShape = bufferTensor->storageShape.WithHW(bufferShape.Height(), bufferShape.Width()); + } + } + + // Check buffering tensors are meaningfully defined + assert(!cost->bufferedWeightTensor.tensor || + (cost->bufferedWeightTensor.tensor->allocatedSize == 0 || (cost->bufferedWeightTensor.tensor->srcTensor != nullptr))); + } +} + + +// Coalesce repeated weight buffer tensors +void Scheduler::CoalesceWeightBufferTensors(Schedule *schedule) +{ + SchedulerOpInfo *prevCost = nullptr; + + for ( auto &schedOp : _ops ) + { + if ( !schedOp->IsNpuOp() ) + { + continue; + } + + auto cost = schedule->Cost(schedOp.get()); + if ( prevCost && cost ) + { + auto &prevBufTensor = prevCost->bufferedWeightTensor.tensor; + auto &bufTensor = cost->bufferedWeightTensor.tensor; + if ( prevBufTensor && bufTensor ) + { + UniqueId prevWeightsTensorId = prevCost->npuWeightsTensor ? prevCost->npuWeightsTensor->equivalenceId : -1; + UniqueId weightsTensorId = cost->npuWeightsTensor ? 
cost->npuWeightsTensor->equivalenceId : -2; + if ( prevWeightsTensorId == weightsTensorId && prevBufTensor->allocatedSize == bufTensor->allocatedSize && + prevCost->ofmDepthSlices.size() == 2 && cost->ofmDepthSlices.size() == 2 && prevCost->ofmDepthSlices == cost->ofmDepthSlices ) + { + // Reuse previous weight buffer tensor if both current and previous op use 1 depth slice + // This will extend the life range weight buffer tensor + bufTensor = prevBufTensor; + } + } + } + + prevCost = cost; + } +} + + +PerformanceQuery Scheduler::InitPerfQuery(SchedulerOperation *op, ArchitectureOpConfig *config, int ofmDepth = -1) +{ + PerformanceQuery query = {}; + query.type = op->Type(); + query.kernel = op->Kernel(); + query.config = config; + + SchedulerConnection *ifm0 = op->IFM(0); + query.ifmShape[0] = ifm0->shape; + query.ifmMemory[0] = ifm0->tensor->memArea.memory; + query.ifmType[0] = ifm0->tensor->dataType; + query.ifmFormat[0] = ifm0->tensor->format; + + SchedulerConnection *ifm1 = op->TryIFM(1); + if ( ifm1 ) + { + query.ifmShape[1] = ifm1->shape; + query.ifmMemory[1] = ifm1->tensor->memArea.memory; + query.ifmType[1] = ifm1->tensor->dataType; + query.ifmFormat[1] = ifm1->tensor->format; + } + + SchedulerConnection *ofm = op->OFM(); + query.ofmShape = (ofmDepth >= 0) ? 
ofm->shape.WithDepth(ofmDepth) : ofm->shape; + query.ofmMemory = ofm->tensor->memArea.memory; + query.ofmType = ofm->tensor->dataType; + query.ofmFormat = ofm->tensor->format; + + SchedulerConnection *scales = op->TryInput(TensorUsage::Scales); + if ( scales ) + { + query.constShape = Shape(1, 1, 1, query.ofmShape.Depth()); + query.constMemory = scales->tensor->memArea.memory; + } + + return query; +} + + +std::vector Scheduler::InitFusionQuery(SchedulerOperation *op) +{ + std::vector fused; + + for ( auto const &subOp : op->SubOps() ) + { + fused.emplace_back(); + + FusionQuery &fusedOp = fused.back(); + fusedOp.type = subOp->Type(); + fusedOp.kernel = subOp->Kernel(); + auto ifm2 = subOp->TryIFM(1); + if ( ifm2 ) + { + fusedOp.ifm2Shape = ifm2->shape; + fusedOp.ifm2Memory = ifm2->tensor->memArea.memory; + fusedOp.ifm2Type = ifm2->tensor->dataType; + fusedOp.ifm2Format = ifm2->tensor->format; + } + } + + return fused; +} + + +CycleCost Scheduler::EstimateOpPerformance(SchedulerOperation *op, ArchitectureOpConfig *config, int ofm_depth) +{ + CycleCost cycleCost; + if ( !op->IsNpuOp() ) + { + LOG_WARN("CPU performance estimation for \"{}\" not implemented\n", OpTypeToString(op->Type())); + return cycleCost; + } + + PerformanceQuery query = InitPerfQuery(op, config, ofm_depth); + std::vector fused = InitFusionQuery(op); + cycleCost = _arch->Performance()->MeasureCycleCost(query, fused); + return cycleCost; +} + +ElementAccess Scheduler::EstimateOpElementAccess(SchedulerOperation *op, ArchitectureOpConfig *config, int ofm_depth) +{ + ElementAccess access; + if ( !op->IsNpuOp() ) + { + LOG_WARN("CPU performance estimation for \"{}\" not implemented\n", OpTypeToString(op->Type())); + return access; + } + PerformanceQuery query = InitPerfQuery(op, config, ofm_depth); + access = _arch->Performance()->MeasureElementAccess(query); + return access; +} + +void Scheduler::PrintSchedule(Schedule *schedule) +{ + LOG_PRINT("Schedule: '{}'\n", schedule->Name()); + for ( auto const 
&schedOp : _ops ) + { + auto cost = schedule->Cost(schedOp.get()); + if ( cost == nullptr ) + { + continue; + } + + LOG_PRINT("\t{0}: Operation {1} - OFM {2}\n", schedOp->Index(), OpTypeToString(schedOp->Type()), + schedOp->OFM()->shape.ToString()); + LOG_PRINT("\t\tKernel: {0}\n", schedOp->Kernel()->ToString()); + + if ( !schedOp->IsNpuOp() ) + { + LOG_PRINT("\t\tCPU Operation\n"); + } + else + { + LOG_PRINT("{0}\n", cost->ToString()); + } + + int mem_usage = 0; + if ( cost->timeIndex >= 0 && cost->timeIndex < int(schedule->memorySnapshot.size()) ) + { + mem_usage = schedule->memorySnapshot[cost->timeIndex]; + } + + LOG_PRINT("\t\tEstimated Perf: Macs={0} Cycles={1}\n", cost->cycles.macs, cost->cycles.opCycles); + LOG_PRINT("\t\tMemory Used: {0} bytes\n", mem_usage); + } + + LOG_PRINT("\tCascades:\n"); + auto const &cascades = schedule->cascades; + + // Sort cascade contents by id and start time + std::vector keys; + for ( auto const &pos : cascades ) + { + keys.push_back(pos.first | (pos.second.start << 16)); + } + std::sort(keys.begin(), keys.end()); + + // Print sorted cascade indices + for ( auto key : keys ) + { + auto const &cascade = cascades.at(key & 0xFFFF); + LOG_PRINT("\t\t{0}: {1} -> {2}, size: {3}\n", key & 0xFFFF, cascade.start, cascade.end, cascade.memUsage); + } +} + + +void ParseSchedulerOptions(SchedulerOptions &opt, IniReader &reader) +{ + // Parse debug settings + std::string key; + while ( reader.Begin(key) ) + { + if ( key == "optimize" ) + { + std::string value; + if ( reader.Read(value) ) + { + if ( _strnicmp(value.data(), "size", 5) == 0 ) + { + opt.optimizationStrategy = OptimizationStrategy::Size; + } + else if ( _strnicmp(value.data(), "performance", 12) == 0 ) + { + opt.optimizationStrategy = OptimizationStrategy::Performance; + } + } + } + else if ( key == "verbose" ) + { + opt.verboseSchedule = reader.Get(); + } + else if ( key == "verbose_allocation" ) + { + opt.verboseAllocation = reader.Get(); + } + else if ( key == 
"arena_size_limit" ) + { + opt.optimizationStagingLimit = reader.Get(); + std::string suffix; + if ( reader.Read(suffix) ) + { + if ( suffix == "kb" ) + { + opt.optimizationStagingLimit *= 1024; + } + else if ( suffix == "mb" ) + { + opt.optimizationStagingLimit *= 1024 * 1024; + } + } + } + + reader.End(); + } +} + + +struct SchedulerTransformParam : public WeightTransformParam +{ + const int64_t *zeroPoints; + int zeroCount; +}; + + +static int ApplyZeroPointIHWO(const WeightTransformParam *param, int value) +{ + const SchedulerTransformParam *p = static_cast(param); + value = (value - int(p->zeroPoints[p->o % p->zeroCount])); + assert(value >= -255 && value <= 255); + return value; +} + + +static int ApplyZeroPointOHWI(const WeightTransformParam *param, int value) +{ + const SchedulerTransformParam *p = static_cast(param); + value = (value - int(p->zeroPoints[p->i % p->zeroCount])); + assert(value >= -255 && value <= 255); + return value; +} + +WeightScaleTensors Scheduler::EncodeWeightAndScaleTensor(std::unique_ptr encodingParams, const SchedulerTensor *weightTens, + const SchedulerTensor *scaleTens, const Quantization &weightQuantization, const Quantization &ofmQuantization) +{ + bool doWeights = true; + bool doScales = true; + + // Check cache for weight tensors already encoded with this configuration. + auto cacheKey = TensorCacheKey(encodingParams.get(), weightTens->equivalenceId); + auto pos = _tensorCache.find(cacheKey); + std::shared_ptr cachedWeightsTensor; + if ( pos != _tensorCache.end() ) + { + const WeightScaleTensors &cached = pos->second; + assert(ofmQuantization.type == QuantizationType::EXPLICIT); + uint32_t scaleHash = HashVector32(ofmQuantization.scales); + // If scale tensor hashes match, return this combined weights tensor. 
+ if ( cached.scaleHash == scaleHash ) + { + return cached; + } + // Already cached weights, but scales differ so perform scale encoding + cachedWeightsTensor = cached.npuWeightsTensor; + doWeights = false; + } + + // Attempt the encode (may fail) + WeightScaleTensors result = TryEncodeWeightAndScaleTensor( + encodingParams.get(), weightTens, scaleTens, weightQuantization, ofmQuantization, doWeights, doScales); + result.scaleHash = HashVector32(ofmQuantization.scales); + + if ( doWeights ) + { + // Weights and scales now encoded together + _tensorCache.emplace(cacheKey, result); + result.npuWeightsTensor->config = std::move(encodingParams); + } + else + { + // Going to reuse a cached tensor for weights (must alias if memory areas don't match). + if ( cachedWeightsTensor->memArea == weightTens->memArea ) + { + result.npuWeightsTensor = std::move(cachedWeightsTensor); + } + else + { + // TODO: Clone tensor (but share buffer) if mem area assignment conflicts. + // Or cache encoded buffers and always wrap in a new tensor. 
+ assert(false); + throw WeightEncodeException{}; + } + } + + return result; +} + +WeightScaleTensors Scheduler::TryEncodeWeightAndScaleTensor(IWeightEncodingConfig *encodingParams, + const SchedulerTensor *weightTens, const SchedulerTensor *scaleTens, const Quantization &weightQuantization, + const Quantization &ofmQuantization, bool doWeights, bool doScales) +{ + // Create tensor to hold encoded output + auto npuTensor = std::make_shared(); + npuTensor->memArea = weightTens->memArea; + + int weightRangeIndex = 0; + int maxSingleBufferLen = 0; + std::vector encodedStream; + + const auto &weightView = weightTens->bufferView; + Shape wshape = weightView.ViewShape(); + auto strides = weightView.StrideBytes() * 8 / DataTypeSizeBits(weightTens->dataType); + + Shape ohwiStrides; + Shape ohwiShape; + if ( weightTens->srcTensor->AxisOrder() == AxisOrder::IHWO ) + { + ohwiStrides = strides.Extract(3, 1, 2, 0); + ohwiShape = wshape.Extract(3, 1, 2, 0); + } + else + { + ohwiStrides = std::move(strides); + ohwiShape = std::move(wshape); + } + + // Set up weight source + SchedulerTransformParam param; + param.zeroPoints = weightQuantization.zeroPoints.data(); + param.zeroCount = int(weightQuantization.zeroPoints.size()); + + auto zeroOffsetFunc = (weightTens->srcTensor->AxisOrder() == AxisOrder::IHWO) ? 
ApplyZeroPointIHWO : ApplyZeroPointOHWI; + auto weightSource = _arch->WeightEncoder()->GetWeightSource(encodingParams, weightTens->dataType, zeroOffsetFunc, &param); + auto scaleSource = _arch->WeightEncoder()->GetScaleSource(encodingParams, scaleTens->dataType, ofmQuantization); + + auto weightsData = weightView.Buffer()->Data(); + int fullOfmDepth = ohwiShape[0]; + int totalWeightBytes = 0; + int subStreams = 1; + int scaleStreamsRequired = 1; + const int streamsRequired = _arch->WeightEncoder()->StreamsRequired(encodingParams, ohwiShape, scaleStreamsRequired); + + // Note: in case of multiple cores, each core's weights are interleaved in O-dimension + auto const &depthOffsets = encodingParams->DepthOffsets(); + const int nrDepthOffsets = int(depthOffsets.size()); + for ( int idx = 0; idx < nrDepthOffsets - 1; ++idx ) + { + int depthOffset = depthOffsets[idx]; + + // Do not generate for offsets outside the OFM + assert(depthOffset >= 0 && depthOffset < fullOfmDepth); + int depthLength = depthOffsets[idx + 1] - depthOffset; + + int bufferStartOffset = int(encodedStream.size()); + + // For each stream, deinterleave weights/scales from the larger volume + // and generate separate compressed streams. 
+ for ( int stream = 0; stream < streamsRequired; ++stream ) + { + int key = WeightKey(stream, depthOffset); + WeightRange range; + range.offset = int(encodedStream.size()); + range.index = weightRangeIndex++; + + if ( doScales && stream < scaleStreamsRequired ) + { + // Encode Scales and biases + auto biases = scaleTens->bufferView.Values(); + scaleSource->SetSource(&biases[0], scaleTens->bufferView.ViewShape().Depth(), depthOffset, depthLength, stream); + if ( scaleSource->Elements() == 0 ) + { + // No more elements left to encode + continue; + } + range.scaleBytes = _arch->WeightEncoder()->EncodeScales(encodingParams, scaleSource.get(), encodedStream, false); + + // Align to 16 for start of next substream + while ( encodedStream.size() % 16 != 0 ) + { + encodedStream.push_back(0); + } + } + + if ( doWeights ) + { + range.weightOffset = int(encodedStream.size()) - range.offset; + + // Encode Weights + ohwiShape[0] = depthLength; + weightSource->SetSource(weightsData, depthOffset, ohwiShape, ohwiStrides, stream); + int len = + _arch->WeightEncoder()->EncodeWeights(encodingParams, weightSource.get(), encodedStream, false).encodedSize; + range.weightBytes = len; + totalWeightBytes += len; + } + + assert(encodedStream.size() % 16 == 0); + npuTensor->encodedRanges[key] = range; + subStreams = std::max(stream + 1, subStreams); + } + + // Remember maximum encoded length for DoubleBuffering + maxSingleBufferLen = std::max(maxSingleBufferLen, int(encodedStream.size()) - bufferStartOffset); + } + + // Reduce stored memory usage as much as possible + encodedStream.shrink_to_fit(); + + auto encodedTensor = std::make_shared(doWeights ? 
weightTens->Name() : scaleTens->Name(), DataType::UInt8); + int streamSize = int(encodedStream.size()); + auto buf = std::make_shared(std::move(encodedStream)); + encodedTensor->SetStorageShape(Shape(1, 1, 1, streamSize)); + encodedTensor->SetBuffer(buf); + + npuTensor->srcTensor = encodedTensor; + npuTensor->maxRangeBytes = maxSingleBufferLen; + npuTensor->totalWeightBytes = totalWeightBytes; + npuTensor->subStreams = subStreams; + npuTensor->storageShape = encodedTensor->StorageShape(); + npuTensor->allocatedSize = encodedTensor->View().BufferSize(); + + WeightScaleTensors result; + result.scaleHash = HashVector32(ofmQuantization.scales); + + if ( doWeights ) + { + result.npuWeightsTensor = std::move(npuTensor); + result.npuScalesTensor = nullptr; + } + else + { + // Only scales encoded + assert(doScales); + result.npuScalesTensor = std::move(npuTensor); + } + + return result; +} + +WeightsInfo Scheduler::AnalyzeWeights(IWeightEncodingConfig *encodingParams, const SchedulerTensor *weightTens, const Quantization &weightQuantization) +{ + + WeightsInfo result; + std::vector encodedStream; + + const auto &weightView = weightTens->bufferView; + Shape wshape = weightView.ViewShape(); + auto strides = weightView.StrideBytes() * 8 / DataTypeSizeBits(weightTens->dataType); + + Shape ohwiStrides; + Shape ohwiShape; + if ( weightTens->srcTensor->AxisOrder() == AxisOrder::IHWO ) + { + ohwiStrides = strides.Extract(3, 1, 2, 0); + ohwiShape = wshape.Extract(3, 1, 2, 0); + } + else + { + ohwiStrides = std::move(strides); + ohwiShape = std::move(wshape); + } + + // Set up weight source + SchedulerTransformParam param; + param.zeroPoints = weightQuantization.zeroPoints.data(); + param.zeroCount = int(weightQuantization.zeroPoints.size()); + + auto zeroOffsetFunc = (weightTens->srcTensor->AxisOrder() == AxisOrder::IHWO) ? 
ApplyZeroPointIHWO : ApplyZeroPointOHWI; + auto weightSource = _arch->WeightEncoder()->GetWeightSource(encodingParams, weightTens->dataType, zeroOffsetFunc, &param); + auto weightsData = weightView.Buffer()->Data(); + int fullOfmDepth = ohwiShape[0]; + int subStreams = 1; + int scaleStreamsRequired = 1; + const int streamsRequired = _arch->WeightEncoder()->StreamsRequired(encodingParams, ohwiShape, scaleStreamsRequired); + // Note: in case of multiple cores, each core's weights are interleaved in O-dimension + auto const &depthOffsets = encodingParams->DepthOffsets(); + const int nrDepthOffsets = int(depthOffsets.size()); + for ( int idx = 0; idx < nrDepthOffsets - 1; ++idx ) + { + int depthOffset = depthOffsets[idx]; + + // Do not generate for offsets outside the OFM + assert(depthOffset >= 0 && depthOffset < fullOfmDepth); + int depthLength = depthOffsets[idx + 1] - depthOffset; + + int bufferStartOffset = int(encodedStream.size()); + + // For each stream, deinterleave weights/scales from the larger volume + // and generate separate compressed streams. 
+ for ( int stream = 0; stream < streamsRequired; ++stream ) + { + int key = WeightKey(stream, depthOffset); + // Encode Weights + ohwiShape[0] = depthLength; + weightSource->SetSource(weightsData, depthOffset, ohwiShape, ohwiStrides, stream); + auto weightInfo = _arch->WeightEncoder()->EncodeWeights(encodingParams, weightSource.get(), encodedStream, true); + result.sourceSize += weightInfo.sourceSize; + result.encodedSize += weightInfo.encodedSize; + result.zeroCount += weightInfo.zeroCount; + } + } + result.streams = streamsRequired; + return result; +} +} // namespace regor diff --git a/ethosu/regor/compiler/scheduler.hpp b/ethosu/regor/compiler/scheduler.hpp new file mode 100644 index 00000000..f028cb42 --- /dev/null +++ b/ethosu/regor/compiler/scheduler.hpp @@ -0,0 +1,340 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#pragma once + +#include "common/common.hpp" + +#include "architecture/architecture.hpp" +#include "architecture/weight_encoder.hpp" +#include "cascade_builder.hpp" +#include "common/shape.hpp" +#include "graph.hpp" +#include "quantization.hpp" +#include "scheduler_operation.hpp" + +#include +#include +#include +#include +#include + +namespace regor +{ + +class IncrementalLinearAllocator; + +enum class OptimizationStrategy +{ + Size, + Performance, +}; + +/// +/// Scheduling options +/// +struct SchedulerOptions +{ + OptimizationStrategy optimizationStrategy = OptimizationStrategy::Size; + Address optimizationStagingLimit = 0; + bool verboseSchedule = false; + bool verboseAllocation = false; +}; + +/// +/// Metadata for each scheduled operation (unique per schedule) +/// +class SchedulerOpInfo +{ +private: + std::unique_ptr _config; + +public: + Shape stripeInput[2]; + Shape stripe; + int cascade = 0; + int timeIndex = -1; + int weightSize = 0; + std::vector ofmDepthSlices; + int64_t slackBufferingCycles = 0; + int slackBufferingMemory = 0; + int64_t fullWeightTransferCycles = 0; + // Encoded weights in readonly memory + std::shared_ptr npuWeightsTensor; + // Encoded scales in readonly memory + std::shared_ptr npuScalesTensor; + // Buffered weights/scales in fast storage + SchedulerConnection bufferedWeightTensor; + CycleCost cycles; + ElementAccess elementAccess; + +public: + SchedulerOpInfo(std::unique_ptr opConfig, const Shape &stripeInput1, const Shape &stripeInput2, const Shape &stripe_) + { + this->_config = std::move(opConfig); + this->stripeInput[0] = stripeInput1; + this->stripeInput[1] = stripeInput2; + this->stripe = stripe_; + this->ofmDepthSlices = {0, stripe_.Depth()}; + } + + SchedulerOpInfo(const SchedulerOpInfo &other) { Copy(other); } + + const SchedulerOpInfo &operator=(const SchedulerOpInfo &other) + { + Copy(other); + return *this; + } + + void SetWeightScaleTensors(const std::shared_ptr &weights, const std::shared_ptr &scales) + { + 
npuWeightsTensor = weights; + npuScalesTensor = scales; + } + + ArchitectureOpConfig *Config() const { return _config.get(); } + + std::string ToString() const + { + std::string temp = fmt::format("\t\tTime index = {0}\n", this->timeIndex); + temp += fmt::format( + "\t\tOperator Config = {0}\n" + "\t\tIFM Stripe = [{1}]\n" + "\t\tIFM2 Stripe = [{2}]\n" + "\t\tOFM Stripe = [{3}]\n", + _config ? _config->ToString(false) : "no config", stripeInput[0].ToString(), stripeInput[1].ToString(), + stripe.ToString()); + + temp += fmt::format("\t\tAssigned Cascade = {0}", this->cascade); + + if ( npuWeightsTensor ) + { + // TODO: Finish formatting; + temp += fmt::format( + "\n\t\tEncoded Weights = {0} bytes\n" + "\t\tWeight buffer = {1} bytes\n" + "\t\tDepth slices = [{2}]", + npuWeightsTensor->AllocationSizeBytes(), + bufferedWeightTensor.tensor ? bufferedWeightTensor.tensor->AllocationSizeBytes() : 0, fmt::join(ofmDepthSlices, ", ")); + } + + return temp; + } + +private: + void Copy(const SchedulerOpInfo &other) + { + if ( other._config ) + { + // Must duplicate (can't be auto-generated) + _config = other._config->Clone(); + } + + // Potentially generatable + stripeInput[0] = other.stripeInput[0]; + stripeInput[1] = other.stripeInput[1]; + stripe = other.stripe; + cascade = other.cascade; + timeIndex = other.timeIndex; + weightSize = other.weightSize; + ofmDepthSlices = other.ofmDepthSlices; + slackBufferingCycles = other.slackBufferingCycles; + slackBufferingMemory = other.slackBufferingMemory; + fullWeightTransferCycles = other.fullWeightTransferCycles; + npuWeightsTensor = other.npuWeightsTensor; + npuScalesTensor = other.npuScalesTensor; + bufferedWeightTensor = other.bufferedWeightTensor; + cycles = other.cycles; + elementAccess = other.elementAccess; + } +}; + + +using SchedulerCostMap = std::unordered_map>; + +/// +/// Individual schedule +/// +class Schedule +{ +private: + std::string _name; + SchedulerCostMap _costMap; + +public: + std::unordered_map cascades; + 
std::vector memorySnapshot; + int fastStoragePeakUsage = 0; + std::unordered_map memoryUsage; + +public: + Schedule(const std::string &name) : _name(name) {} + + const std::string &Name() const { return _name; } + + void SetCost(UniqueId id, std::unique_ptr opInfo) { _costMap[id] = std::move(opInfo); } + + SchedulerOpInfo *Cost(const SchedulerOperation *op) const { return op ? Cost(*op) : nullptr; } + SchedulerOpInfo *Cost(UniqueId id) const + { + auto pos = _costMap.find(id); + return (pos != _costMap.end()) ? pos->second.get() : nullptr; + } + + const SchedulerCostMap &Costs() const { return _costMap; } + + int MemoryUsageAt(int timeIndex) const + { + return (timeIndex < int(memorySnapshot.size())) ? memorySnapshot[timeIndex] : 0; + } + + void DetachCosts(SchedulerCostMap &costs) { costs = std::move(_costMap); } + + void UpdateCosts(SchedulerCostMap &costs) + { + for ( auto &pos : costs ) + { + _costMap[pos.first] = std::move(pos.second); + } + } + + void UpdateCascades(const std::unordered_map &other) + { + cascades.insert(other.begin(), other.end()); + } + + const CascadeInfo *Cascade(int cascade) const + { + auto it = cascades.find(cascade); + return it == cascades.end() ? 
nullptr : &it->second; + } +}; + + +/// +/// Executable scheduling implementation +/// +class Scheduler +{ + struct TensorCacheKey + { + public: + IWeightEncodingConfig *_config; // must persist as map entry + UniqueId _uid; + + public: + TensorCacheKey(IWeightEncodingConfig *config, UniqueId uid) : _config(config), _uid(uid) {} + + bool operator==(const TensorCacheKey &other) const + { + return _config->Equals(other._config) && _uid == other._uid; + } + }; + + struct TensorCacheHash + { + std::size_t operator()(const TensorCacheKey &key) const + { + return key._config->Hash() + 37 * std::uintptr_t(key._uid); + } + }; + +private: + Architecture *_arch = nullptr; + SchedulerOptions _options; + std::string _name; + std::vector> &_ops; + std::shared_ptr _maxSchedule; + int _minMemoryRequired = 0; + bool _spilling = false; + std::unordered_map _tensorCache; + +public: + Scheduler(Architecture *arch, const SchedulerOptions &options, const std::string &name, + std::vector> &ops); + +public: + std::shared_ptr Process(); + + static std::unique_ptr ToGraph(std::vector> &ops, + std::unordered_map &tensorAddressMap, const Graph *srcGraph); + + void AllocateReadOnlyAddresses(Schedule *schedule, IncrementalLinearAllocator &readOnlyAllocator); + + static PerformanceQuery InitPerfQuery(SchedulerOperation *op, ArchitectureOpConfig *config, int ofm_depth); + static std::vector InitFusionQuery(SchedulerOperation *op); + +private: + Address CreateSchedulerRepresentation(); + + Point2i GetStripeInputRequirement(const Shape &ofmShape, Kernel *kernel, ArchResampling resampling); + + std::unique_ptr CreateSchedulerOpInfo(SchedulerOperation *op, const Shape &ofmStripeShape); + + std::unique_ptr CreateInitialSchedule(); + + void MoveConstantData(Schedule *refSchedule); + + void AllocateAddresses(Schedule *schedule); + + void UpdateOpMemorySnapshot(Schedule *schedule); + + std::shared_ptr ProposeScheduleBuffering(Schedule *refSchedule, Address stagingLimitBytes); + + void 
ProposeOperatorBuffering(SchedulerOperation *schedOp, SchedulerOperation *prevOp, Schedule *bufferedSchedule, + Schedule *refSchedule, int stagingLimitBytes); + + void ProposeWeightBuffering(SchedulerConnection *weights, SchedulerConnection *scales, SchedulerOperation *schedOp, + SchedulerOperation *prevOp, Schedule *bufferedSchedule, Schedule *refSchedule, int bufferLimitBytes); + + std::shared_ptr ProposeMinimalSchedule(); + + std::shared_ptr OptimizeSchedule(Schedule *schedule, const std::shared_ptr &maxSchedule); + + std::shared_ptr ProposeScheduleStriping(const Shape &finalStripe, const std::string &label, Schedule *refSchedule); + + Address EstimateScheduleMemoryUsage(Schedule *schedule, const std::unordered_map &nonLocalMem); + + std::shared_ptr OptimizeSubSchedule(const CascadeInfo &cascadeInfo, Schedule *refSchedule, Address stagingLimitBytes); + + void ApplySchedule(Schedule *schedule); + + void CoalesceWeightBufferTensors(Schedule *schedule); + + CycleCost EstimateOpPerformance(SchedulerOperation *op, ArchitectureOpConfig *config, int ofm_depth); + + ElementAccess EstimateOpElementAccess(SchedulerOperation *op, ArchitectureOpConfig *config, int ofm_depth); + + void PrintSchedule(Schedule *schedule); + + WeightScaleTensors EncodeWeightAndScaleTensor(std::unique_ptr encodingParams, const SchedulerTensor *weightTens, + const SchedulerTensor *scaleTens, const Quantization &weightQuantization, const Quantization &ofmQuantization); + + WeightScaleTensors TryEncodeWeightAndScaleTensor(IWeightEncodingConfig *encodingParams, + const SchedulerTensor *weightTens, const SchedulerTensor *scaleTens, const Quantization &weightQuantization, + const Quantization &ofmQuantization, bool doWeights, bool doScales); + + WeightsInfo AnalyzeWeights(IWeightEncodingConfig *encodingParams, const SchedulerTensor *weightTens, const Quantization &weightQuantization); + + Flags BestWeightFormat( + SchedulerOperation *op, Shape &ifmShape, Shape &ifm2Shape, Shape &ofmShape, Flags 
weightFormat); +}; + +void ParseSchedulerOptions(SchedulerOptions &opt, IniReader &reader); + +} // namespace regor diff --git a/ethosu/regor/compiler/scheduler_decompose.cpp b/ethosu/regor/compiler/scheduler_decompose.cpp new file mode 100644 index 00000000..5e880f95 --- /dev/null +++ b/ethosu/regor/compiler/scheduler_decompose.cpp @@ -0,0 +1,326 @@ +// +// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "scheduler_decompose.hpp" + +#include "operation_util.hpp" + +#include +#include + +namespace regor +{ + +bool NeedsDecompose(Architecture *arch, const SchedulerOperation *schedOp) +{ + return CanDecompose(arch, schedOp) && !CanRunOnHardware(arch, schedOp); +} + +static std::unique_ptr MakeSubOperation(const SchedulerOperation *schedOp, const Kernel *newKernel = nullptr) +{ + assert(schedOp->SubOps().empty()); + assert(schedOp->Parent() == nullptr); + auto subOp = std::make_unique(schedOp->Type()); + subOp->SetKernel(newKernel ? 
newKernel : schedOp->Kernel()); + subOp->SetHasScaling(schedOp->HasScaling()); + subOp->_srcKey = schedOp->_srcKey; + subOp->SetPrimaryIfmIndex(schedOp->PrimaryIfmIndex()); + subOp->SetParameters(schedOp->Parameters()); + subOp->SetRounding(schedOp->Rounding()); + subOp->SetAccumulatorMode(schedOp->AccumulatorMode()); + for ( const auto *list : {&schedOp->inputs, &schedOp->outputs} ) + { + for ( const auto &item : list->pairs() ) + { + auto usage = item.first; + const auto &connection = item.second; + if ( IsOFM(usage) ) + { + connection.tensor->producers.push_back(subOp.get()); + *subOp->AddOutput(usage) = connection; + } + else + { + connection.tensor->consumers.push_back(subOp.get()); + *subOp->AddInput(usage) = connection; + } + } + } + return subOp; +} + +static auto GetArchAccumulatorSource(const AccumulatorControl &ac) +{ + switch ( ac.source ) + { + case AccumulatorSource::Reset: + return ArchAccumulatorSource::Reset; + case AccumulatorSource::Acc: + return ArchAccumulatorSource::Acc; + case AccumulatorSource::Ifm2: + return ArchAccumulatorSource::Ifm2; + default: + assert(false); + return ArchAccumulatorSource::Reset; + } +} + +bool CanRunOnHardware(Architecture *arch, const SchedulerOperation *schedOp) +{ + regor::ArchitectureOpGroupQuery qOpGroup{}; + auto *ifm = schedOp->TryIFM(0); + auto *ifm2 = schedOp->TryIFM(1); + auto *ofm = schedOp->TryOFM(); + + if ( !ifm || !ofm ) return false; + qOpGroup.type = schedOp->Type(); + qOpGroup.kernel = schedOp->Kernel(); + qOpGroup.ifm.key = ifm->tensor->uid; + qOpGroup.ifm.type = ifm->tensor->dataType; + if ( ifm2 ) + { + qOpGroup.ifm2.key = ifm2->tensor->uid; + qOpGroup.ifm2.type = ifm2->tensor->dataType; + } + qOpGroup.ofm.key = ofm->tensor->uid; + qOpGroup.ofm.type = ofm->tensor->dataType; + if ( arch->CreateOpGroup(qOpGroup) == nullptr ) return false; + regor::ArchitectureConfigQuery qConfig; + qConfig.ofmShape = Shape::PadAxes(ofm->SliceShape(), 3, 1); + qConfig.ifmShape[0] = ifm->SliceShape(); + if ( ifm2 ) + 
{ + qConfig.ifmShape[1] = ifm2->SliceShape(); + } + qConfig.ifmBits = DataTypeSizeBits(ifm->tensor->dataType); + qConfig.kernel = schedOp->Kernel(); + qConfig.lutBytes = schedOp->TryInput(TensorUsage::LUT) ? 2048 : 0; + qConfig.scaled = schedOp->HasScaling(); + qConfig.ifmResampling = ifm->resamplingMode; + qConfig.ofmShape = qConfig.ofmShape.Untranspose(ofm->transpose); + qConfig.transpose = ofm->transpose; + qConfig.ofmFormat = ofm->tensor->format; + const auto &accMode = schedOp->AccumulatorMode(); + qConfig.accSource = GetArchAccumulatorSource(accMode); + qConfig.accOutputEnabled = accMode.outputEnabled; + return arch->GetOpConfig(schedOp->Type(), qConfig) != nullptr; +} + +bool CanDecompose(Architecture *, const SchedulerOperation *schedOp) +{ + // TODO: MLBEDSW-8868 Restore fused transpose/reverse + if ( auto ofm = schedOp->TryOFM(); ofm && (ofm->transpose != TransposeType::None || ofm->reverse != ReverseType::None) ) + return false; + if ( schedOp->Type() == OpType::Conv2D ) return true; + if ( schedOp->Type() == OpType::DepthwiseConv2DBias ) return true; + return false; +} + +using DecomposeFunc = std::vector> (*)(Architecture *, std::unique_ptr); + +// Decompose to sub-operations with size 1 along the leading axes. +// Used for the batch dimension, and for the leading N-3 dimensions for elementwise operations. 
+static std::vector> DecomposeLeadingDimensions( + int dimensions, Architecture *arch, std::unique_ptr op, DecomposeFunc doDecompose) +{ + std::vector> result; + int axis = --dimensions; + auto *ofmConn = op->Output(TensorUsage::OFM); + auto *ifmConn = op->Input(TensorUsage::IFM0); + auto *ifm2Conn = op->TryInput(TensorUsage::IFM1); + auto newIfmSlice = ifmConn->slice; + auto newOfmSlice = ofmConn->slice; + newIfmSlice.shape[axis] = 1; + newOfmSlice.shape[axis] = 1; + TensorSlice newIfm2Slice; + if ( ifm2Conn != nullptr ) + { + newIfm2Slice = ifm2Conn->slice; + newIfm2Slice.shape[axis] = 1; + } + auto dimSize = ofmConn->shape[axis]; + for ( int i = 0; i < dimSize; i++ ) + { + std::unique_ptr subOp = MakeSubOperation(op.get()); + newIfmSlice.offset[axis] = i; + newOfmSlice.offset[axis] = i; + if ( ifm2Conn != nullptr ) + { + newIfm2Slice.offset[axis] = i; + } + subOp->Input(TensorUsage::IFM)->slice = newIfmSlice; + subOp->Output(TensorUsage::OFM)->slice = newOfmSlice; + if ( ifm2Conn != nullptr ) + { + subOp->Input(TensorUsage::IFM1)->slice = newIfm2Slice; + } + auto subOps = (dimensions > 0) ? 
DecomposeLeadingDimensions(dimensions, arch, std::move(subOp), doDecompose) : doDecompose(arch, std::move(subOp)); + result.insert(result.end(), std::make_move_iterator(subOps.begin()), std::make_move_iterator(subOps.end())); + } + return result; +} + +// Handle dilation by decomposing to suboperations with input stride = dilation and dilation 1 +static std::vector> +HandleDilation(Architecture *arch, std::unique_ptr op, DecomposeFunc doDecompose) +{ + std::vector> result; + auto *ofmConn = op->Output(TensorUsage::OFM); + auto *ifmConn = op->Input(TensorUsage::IFM); + auto *kernel = op->Kernel(); + auto &dilation = kernel->Dilation(); + auto &stride = kernel->Stride(); + auto GY = std::gcd(dilation.y, stride.y); + auto GX = std::gcd(dilation.x, stride.x); + auto DY = dilation.y / GY; + auto DX = dilation.x / GX; + for ( auto dy = 0; dy < DY; ++dy ) + { + for ( auto dx = 0; dx < DX; ++dx ) + { + auto newIfmSlice = ifmConn->slice; + auto newOfmSlice = ofmConn->slice; + auto ifmStrides = ifmConn->stepXY; + auto ofmStrides = ofmConn->stepXY; + newIfmSlice.offset[1] += dy * GY; + newIfmSlice.offset[2] += dx * GX; + ifmStrides.y *= DY * GY; + ifmStrides.x *= DX * GX; + newOfmSlice.offset[1] += dy; + newOfmSlice.offset[2] += dx; + newOfmSlice.shape[1] -= dy; + newOfmSlice.shape[2] -= dx; + ofmStrides.y *= DY; + ofmStrides.x *= DX; + auto newKernel = kernel->WithDilation({1, 1}).WithStride(stride / Point2i{GX, GY}); + std::unique_ptr subOp = MakeSubOperation(op.get(), &newKernel); + auto *subIfmConn = subOp->Input(TensorUsage::IFM); + subIfmConn->slice = std::move(newIfmSlice); + subIfmConn->stepXY = ifmStrides; + auto *subOfmConn = subOp->Output(TensorUsage::OFM); + subOfmConn->slice = std::move(newOfmSlice); + subOfmConn->stepXY = ofmStrides; + auto subOps = doDecompose(arch, std::move(subOp)); + result.insert(result.end(), std::make_move_iterator(subOps.begin()), std::make_move_iterator(subOps.end())); + } + } + return result; +} + +// Negative ifm offsets indicate new 
padding values with ifm offset 0 +static void UpdatePaddingIfOffsetNegative(SchedulerOperation *op) +{ + auto &ifmSlice = op->Input(TensorUsage::IFM)->slice; + if ( ifmSlice.offset.Height() < 0 || ifmSlice.offset.Width() < 0 ) + { + auto *kernel = op->Kernel(); + auto &padding = kernel->Padding(); + auto topPad = std::max(0, -ifmSlice.offset.Height()); + auto leftPad = std::max(0, -ifmSlice.offset.Width()); + auto newPadding = Margin(topPad, leftPad, padding.Bottom(), padding.Right()); + ifmSlice.offset[1] = std::max(0, ifmSlice.offset.Height()); + ifmSlice.offset[2] = std::max(0, ifmSlice.offset.Width()); + auto newKernel = kernel->WithPadding(newPadding); + op->SetKernel(&newKernel); + } +} + +std::vector> DecomposeConv2D(Architecture *arch, std::unique_ptr op) +{ + std::vector> result; + auto *ofmConn = op->Output(TensorUsage::OFM); + auto *ifmConn = op->Input(TensorUsage::IFM); + const auto &ofmShape = ofmConn->shape; + const auto &ifmShape = ifmConn->shape; + auto &ofmSlice = ofmConn->slice; + auto &ifmSlice = ifmConn->slice; + auto *kernel = op->Kernel(); + auto &padding = kernel->Padding(); + if ( !ofmSlice.offset.IsValid() ) + { + ofmSlice.offset = ofmShape.WithZeros(); + ifmSlice.offset = ifmShape.WithZeros().WithHW(-padding.Top(), -padding.Left()); + } + if ( ofmShape.Batch() > 1 ) + { + return DecomposeLeadingDimensions(1, arch, std::move(op), DecomposeConv2D); + } + if ( CanRunOnHardware(arch, op.get()) ) + { + UpdatePaddingIfOffsetNegative(op.get()); + result.emplace_back(std::move(op)); + return result; + } + auto &dilation = kernel->Dilation(); + if ( dilation.x > 1 || dilation.y > 1 ) + { + return HandleDilation(arch, std::move(op), DecomposeConv2D); + } + // TODO: MLBEDSW-8783 Decompose convolutions with large stride + // If we get here, decomposition has failed, the resulting operations will be executed on CPU + result.emplace_back(std::move(op)); + return result; +} + +std::vector> DecomposeDepthwiseConv2D(Architecture *arch, std::unique_ptr op) 
+{ + std::vector> result; + auto *ofmConn = op->Output(TensorUsage::OFM); + auto *ifmConn = op->Input(TensorUsage::IFM); + auto *weightsConn = op->Input(TensorUsage::Weights); + const auto &ofmShape = ofmConn->shape; + const auto &ifmShape = ifmConn->shape; + const auto &weightsShape = weightsConn->shape; + auto &ofmSlice = ofmConn->slice; + auto &ifmSlice = ifmConn->slice; + auto *kernel = op->Kernel(); + auto &padding = kernel->Padding(); + if ( !ofmSlice.offset.IsValid() ) + { + ofmSlice.offset = ofmShape.WithZeros(); + ifmSlice.offset = ifmShape.WithZeros().WithHW(-padding.Top(), -padding.Left()); + } + if ( ofmShape.Batch() > 1 ) + { + return DecomposeLeadingDimensions(1, arch, std::move(op), DecomposeDepthwiseConv2D); + } + if ( weightsShape.Depth() > 1 ) + { + // TODO: MLBEDSW-8789 Handle depthwise convolution with depth multiplier > 1 + // If we get here, decomposition has failed, the resulting operations will be executed on CPU + result.emplace_back(std::move(op)); + return result; + } + if ( CanRunOnHardware(arch, op.get()) ) + { + UpdatePaddingIfOffsetNegative(op.get()); + result.emplace_back(std::move(op)); + return result; + } + auto &dilation = kernel->Dilation(); + if ( dilation.x > 1 || dilation.y > 1 ) + { + return HandleDilation(arch, std::move(op), DecomposeDepthwiseConv2D); + } + // TODO: MLBEDSW-8783 Decompose convolutions with large stride + // If we get here, decomposition has failed, the resulting operations will be executed on CPU + result.emplace_back(std::move(op)); + return result; +} + +} // namespace regor diff --git a/ethosu/regor/compiler/scheduler_decompose.hpp b/ethosu/regor/compiler/scheduler_decompose.hpp new file mode 100644 index 00000000..970c05f7 --- /dev/null +++ b/ethosu/regor/compiler/scheduler_decompose.hpp @@ -0,0 +1,36 @@ +// +// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you 
may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "graph.hpp" +#include "operation.hpp" +#include "scheduler_operation.hpp" + +#include + +namespace regor +{ +bool NeedsDecompose(Architecture *arch, const SchedulerOperation *schedOp); +bool CanRunOnHardware(Architecture *arch, const SchedulerOperation *schedOp); +bool CanDecompose(Architecture *arch, const SchedulerOperation *schedOp); +std::vector> DecomposeConv2D(Architecture *arch, std::unique_ptr op); +std::vector> DecomposeDepthwiseConv2D(Architecture *arch, std::unique_ptr op); +std::vector> DecomposeElementwise(Architecture *arch, std::unique_ptr op); + +} // namespace regor diff --git a/ethosu/regor/compiler/scheduler_operation.hpp b/ethosu/regor/compiler/scheduler_operation.hpp new file mode 100644 index 00000000..3e663905 --- /dev/null +++ b/ethosu/regor/compiler/scheduler_operation.hpp @@ -0,0 +1,305 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "common/common.hpp" + +#include "common/ordered_map.hpp" +#include "kernel.hpp" +#include "operation.hpp" +#include "tensor.hpp" + +#include + +namespace regor +{ + +class SchedulerOperation; + +int TensorAllocationBytes(const Shape &shape, TensorFormat format, DataType dtype); + +/// +/// Scheduler's metadata for graph tensors. +/// +struct SchedulerTensor +{ +public: + std::shared_ptr srcTensor; + TensorFormat format = TensorFormat::Unknown; + MemArea memArea; + Shape storageShape; + BufferView bufferView; + DataType dataType; + bool hasCPUReaders = false; + bool hasCPUWriters = false; + bool isGraphInput = false; + bool isGraphOutput = false; + int allocatedSize = -1; + Address allocatedAddress = -1; + bool needsLinearFormat = false; + // If two tensors have same equivalence id and same memory area, they can be stored on the same address + UniqueId equivalenceId = GenerateUniqueId(); + UniqueId uid = ~0u; // Packing must initialise + std::vector producers; + std::vector consumers; + + int AllocationSizeBytes() const + { + return (allocatedSize > 0) ? allocatedSize : TensorAllocationBytes(storageShape, format, dataType); + } + std::string Name() const { return srcTensor.get() == nullptr ? "?" : srcTensor->Name(); } + bool IsConstant() const { return bufferView.HasBuffer() && bufferView.BufferSize() > 0; } +}; + + +enum class Buffering +{ + None, + Single, + Double, +}; + + +/// +/// Scheduler's metadata for tensor connections. 
This data is not shared +/// between operators working on the same tensor +/// +struct SchedulerConnection +{ + std::shared_ptr tensor; + Shape shape; + TensorSlice slice; + Point2i stepXY{1, 1}; + Quantization quantization; + ArchResampling resamplingMode = ArchResampling::None; + TransposeType transpose = TransposeType::None; + ReverseType reverse = ReverseType::None; + bool requireFullTensor = false; + bool preBuffer = false; + Buffering buffering = Buffering::None; + + int PartialAllocationSizeBytes() const { return TensorAllocationBytes(shape, tensor->format, tensor->dataType); } + const Shape &SliceShape() const { return slice.shape.IsEmpty() ? shape : slice.shape; } +}; + +enum class AccumulatorSource +{ + Reset = 0, + Acc = 1, + Ifm2 = 2 +}; + +struct AccumulatorControl +{ + AccumulatorSource source = AccumulatorSource::Reset; + bool outputEnabled = true; +}; + +/// +/// Scheduler's representation of executable operations +/// +class SchedulerOperation +{ + friend class SchedulerPacking; + friend class Scheduler; + +public: + OpType _type; + int _index = -1; // Execution index + std::unique_ptr _kernel; + bool _npuOp = false; + bool _hasScaling = false; + void *_srcKey = nullptr; + int _primaryIfmIndex = 0; + OpTypeParameters _parameters; + Operation::Attributes _attributes; + RoundMode _rounding = RoundMode::DBL; + AccumulatorControl _accumulatorControl; + DynamicRef _attr; + const class SchedulerOperation *_parent = nullptr; + std::vector> _subOps; // (activations, or Ethos-U85 chained ops) + ordered_map inputs; + ordered_map outputs; + std::unique_ptr _opGroup; + int _opGroupKey = 0; + +private: + UniqueId _uid; + +public: + SchedulerOperation(OpType opType) : _type(opType), _parameters({}) { _uid = GenerateUniqueId(); } + ~SchedulerOperation() { Disconnect(); } + +public: + OpType Type() const { return _type; } + int Index() const { return _index; } + + UniqueId Uid() const { return _uid; } + operator UniqueId() const { return _uid; } + + bool IsNpuOp() 
const { return _npuOp; } + void SetNpuOp(bool npuOp) { _npuOp = npuOp; } + + class Kernel *Kernel() const { return _kernel.get(); } + void SetKernel(const class Kernel *kernel) { _kernel = std::make_unique(*kernel); } + + bool HasScaling() const { return _hasScaling; } + void SetHasScaling(bool hasScaling) { _hasScaling = hasScaling; } + + RoundMode Rounding() const { return _rounding; } + void SetRounding(RoundMode rounding) { _rounding = rounding; } + + const AccumulatorControl &AccumulatorMode() const { return _accumulatorControl; } + void SetAccumulatorMode(const AccumulatorControl &accumulatorControl) { _accumulatorControl = accumulatorControl; } + + const class SchedulerOperation *Parent() const { return _parent; } + void SetParent(const class SchedulerOperation *parent) { _parent = parent; } + + int PrimaryIfmIndex() const { return _primaryIfmIndex; } + void SetPrimaryIfmIndex(int index) { _primaryIfmIndex = index; } + + const OpTypeParameters &Parameters() const { return _parameters; } + OpTypeParameters &Parameters() { return _parameters; } + void SetParameters(OpTypeParameters parameters) { _parameters = parameters; } + + const Operation::Attributes &Attributes() const { return _attributes; } + Operation::Attributes &Attributes() { return _attributes; } + void SetAttributes(Operation::Attributes attributes) { _attributes = attributes; } + + template + TYPE *Attribute() + { + if ( _attr && _attr.Info()->Hash() == TypeHash::HASH ) + { + return static_cast(_attr.Instance()); + } + return nullptr; + } + + // Input connections + SchedulerConnection *AddInput(TensorUsage usage) { return &inputs[usage]; } + + const SchedulerConnection *TryInput(TensorUsage usage) const { return inputs.try_ref(usage); } + SchedulerConnection *TryInput(TensorUsage usage) { return inputs.try_ref(usage); } + SchedulerConnection *Input(TensorUsage usage) { return &inputs.at(usage); } + const SchedulerConnection *Input(TensorUsage usage) const { return &inputs.at(usage); } + + 
SchedulerConnection *TryIFM(int index) { return inputs.try_ref(MakeTensorUsage(TensorUsage::IFM, index)); } + const SchedulerConnection *TryIFM(int index) const + { + return inputs.try_ref(MakeTensorUsage(TensorUsage::IFM, index)); + } + SchedulerConnection *IFM(int index) { return &inputs.at(MakeTensorUsage(TensorUsage::IFM, index)); } + const SchedulerConnection *IFM(int index) const { return &inputs.at(MakeTensorUsage(TensorUsage::IFM, index)); } + + // Output connections + SchedulerConnection *AddOutput(TensorUsage usage) { return &outputs[usage]; } + + SchedulerConnection *TryOutput(TensorUsage usage) { return outputs.try_ref(usage); } + SchedulerConnection *Output(TensorUsage usage) { return &outputs.at(usage); } + const SchedulerConnection *Output(TensorUsage usage) const { return &outputs.at(usage); } + + SchedulerConnection *TryOFM() { return outputs.try_ref(TensorUsage::OFM); } + const SchedulerConnection *TryOFM() const { return outputs.try_ref(TensorUsage::OFM); } + SchedulerConnection *OFM() { return &outputs.at(TensorUsage::OFM); } + const SchedulerConnection *OFM() const { return &outputs.at(TensorUsage::OFM); } + + void AddSubOp(std::unique_ptr subOp) { _subOps.push_back(std::move(subOp)); } + + const std::vector> &SubOps() const { return _subOps; } + + // Returns connections for which live range calculation is needed + std::vector> LiveRangeTensors() const + { + std::vector> liveTensors; + for ( const auto *list : {&inputs, &outputs} ) + { + for ( const auto &item : list->pairs() ) + { + auto usage = item.first & TensorUsage::TypeMask; + if ( usage == TensorUsage::IFM || usage == TensorUsage::OFM || usage == TensorUsage::LUT ) + { + liveTensors.push_back(std::make_pair(item.first, item.second.tensor.get())); + } + } + } + // TODO: intermediates + return liveTensors; + } + + void SetOpGroup(std::unique_ptr &&opGroup) { _opGroup = std::move(opGroup); } + + void SetOpGroupKey(int opGroupKey) { _opGroupKey = opGroupKey; } + + bool IsReordering() const 
+ { + if ( !IsNone(OFM()->transpose) ) + { + return true; + } + + if ( OFM()->reverse != ReverseType::None ) + { + return true; + } + + for ( const auto &op : _subOps ) + { + if ( op->IsReordering() ) + { + return true; + } + } + + return false; + } + + void Disconnect() + { + for ( const auto *list : {&inputs, &outputs} ) + { + for ( const auto &item : list->pairs() ) + { + auto usage = item.first; + const auto &connection = item.second; + auto &vec = IsOFM(usage) ? connection.tensor->producers : connection.tensor->consumers; + vec.erase(std::remove(vec.begin(), vec.end(), this), vec.end()); + } + } + inputs.clear(); + outputs.clear(); + } + + bool IsDisconnected() const { return inputs.empty() && outputs.empty(); } +}; + +/// +/// NPU-Operation +/// +class NPUOperation +{ +private: + std::vector> _ops; + +public: + const std::vector> &Operations() const { return _ops; }; + void AddOperation(std::unique_ptr op) { _ops.push_back(std::move(op)); } +}; + +} // namespace regor diff --git a/ethosu/regor/compiler/scheduler_packing.cpp b/ethosu/regor/compiler/scheduler_packing.cpp new file mode 100644 index 00000000..44fba907 --- /dev/null +++ b/ethosu/regor/compiler/scheduler_packing.cpp @@ -0,0 +1,493 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#include "scheduler_packing.hpp" + +#include "common/common.hpp" +#include "common/logging.hpp" + +#include "common/shape.hpp" +#include "graph.hpp" +#include "operation.hpp" +#include "scheduler_decompose.hpp" +#include "scheduler_operation.hpp" +#include "tensor.hpp" + +#include + +namespace regor +{ + +namespace +{ + +// Returns true if no IFMs, or if all IFMs are graph inputs +bool AllInputsAreGraphInputs(const SchedulerOperation &op) +{ + for ( const auto &schedConn : op.inputs ) + { + if ( !schedConn.tensor->IsConstant() && !schedConn.tensor->isGraphInput ) return false; + } + return true; +} + +// Returns true if no OFMs, or if all OFMs are graph outputs +bool AllOutputsAreGraphOutputs(const SchedulerOperation &op) +{ + for ( const auto &outputSchedConn : op.outputs ) + { + if ( !outputSchedConn.tensor->isGraphOutput ) return false; + } + return true; +} + +// Returns true if any of first's OFMs are same as second's IFMs +bool IsConnected(const SchedulerOperation &first, const SchedulerOperation &second) +{ + for ( const auto &firstOutputSchedConn : first.outputs ) + { + for ( const auto &secondInputSchedConn : second.inputs ) + { + if ( firstOutputSchedConn.tensor == secondInputSchedConn.tensor ) return true; + } + } + return false; +} + +} // namespace + +SchedulerPacking::SchedulerPacking(Architecture *arch) : _arch(arch) +{ +} + +std::vector> SchedulerPacking::Process(const Graph *graph) +{ + // Get operation list in execution order + std::vector executionList; + Graph::TraverseGraphFromEnd(graph->Outputs(), + [&](Operation *op) -> bool + { + executionList.push_back(op); + return true; + }); + + FilterOperations(executionList, graph); + + PackOperations(); + + ReorderOperations(); + + return std::move(_schedList); +} + +void SchedulerPacking::FilterOperations(const std::vector &executionList, const Graph *graph) +{ + // Convert linear Graph Operations to a list of Scheduler Operations + for ( Operation *op : executionList ) + { + auto schedOp = 
MakeSchedulerOperation(op, graph); + if ( NeedsDecompose(_arch, schedOp.get()) ) + { + auto schedOps = DecomposeSchedulerOperation(std::move(schedOp)); + _schedList.insert( + _schedList.end(), std::make_move_iterator(schedOps.begin()), std::make_move_iterator(schedOps.end())); + } + else + { + _schedList.push_back(std::move(schedOp)); + } + } +} + +void SchedulerPacking::SchedulerPacking::PackOperations() +{ + LOG_TRACE1("Scheduler Packing (of {0} Ops)\n", _schedList.size()); + + auto cur = _schedList.begin(); + auto write = cur; + + while ( cur != _schedList.end() ) + { + SchedulerOperation *primaryOp = cur->get(); + + // Compact the list as we go + if ( std::distance(write, cur) >= 1 ) + { + *write = std::move(*cur); + } + primaryOp->_index = int(std::distance(_schedList.begin(), write)); + + cur++; + + LOG_TRACE1("Creating new group with {}\n", OpTypeToString(primaryOp->Type())); + + ArchitectureOpGroupQuery op0{}; + op0.type = primaryOp->Type(); + op0.kernel = primaryOp->Kernel(); + op0.ifm.key = primaryOp->IFM(0)->tensor->uid; + op0.ifm.type = primaryOp->IFM(0)->tensor->dataType; + if ( primaryOp->TryIFM(1) ) + { + op0.ifm2.key = primaryOp->IFM(1)->tensor->uid; + op0.ifm2.type = primaryOp->IFM(1)->tensor->dataType; + } + op0.ofm.key = primaryOp->OFM()->tensor->uid; + op0.ofm.type = primaryOp->OFM()->tensor->dataType; + + // Try to create OpGroup + auto group = _arch->CreateOpGroup(op0); + + // OpGroup is nullptr if op can't run on NPU + if ( group ) + { + primaryOp->SetNpuOp(true); + + // First op in group has key 0 + int prevOpKey = 0; + primaryOp->SetOpGroupKey(prevOpKey); + LOG_TRACE1("Created new group with {} (key {})\n", OpTypeToString(primaryOp->Type()), prevOpKey); + + // Root SchedulerOperation takes ownership of the ArchitectureOpGroup here + primaryOp->SetOpGroup(std::move(group)); + + // Pack any future ops that will fit + auto prevOp = primaryOp; + + // Try chaining subsequent ops into the primary + while ( cur != _schedList.end() ) + { + 
SchedulerOperation *nextOp = cur->get(); + assert(nextOp); // Empty op may be possible if we seek ahead + + int key = CanPack(primaryOp, prevOp, nextOp, prevOpKey); + if ( !key ) + { + LOG_TRACE1("Can't add next op\n"); + break; + } + nextOp->SetNpuOp(true); + nextOp->SetParent(primaryOp); + nextOp->SetOpGroupKey(key); + + LOG_TRACE1("Added {} (key {}) to {} (key {})\n", OpTypeToString(nextOp->Type()), key, + OpTypeToString(prevOp->Type()), prevOpKey); + + // Replace primary op's OFM by nextOp's OFM + auto *ofmConn = primaryOp->OFM(); + ofmConn->tensor = nextOp->OFM()->tensor; + if ( IsActivation(nextOp->Type()) ) + { + ofmConn->quantization = prevOp->Output(TensorUsage::OFM)->quantization; + ofmConn->quantization.quantMin = nextOp->Output(TensorUsage::OFM)->quantization.quantMin; + ofmConn->quantization.quantMax = nextOp->Output(TensorUsage::OFM)->quantization.quantMax; + } + // Add nextOp's LUT to primary Op + auto lutConn = nextOp->TryInput(TensorUsage::LUT); + if ( lutConn != nullptr ) + { + primaryOp->AddInput(TensorUsage::LUT)->tensor = lutConn->tensor; + } + + prevOpKey = key; + prevOp = nextOp; + primaryOp->AddSubOp(std::move(*cur)); + cur++; + } + + LOG_TRACE1("\t{0}: {1} - OFM [{2}] <- (IFM0 [{3}], IFM1 [{4}], Primary={5})\n", primaryOp->Index(), + OpTypeToString(primaryOp->Type()), primaryOp->OFM()->shape.ToString(), primaryOp->IFM(0)->shape.ToString(), + primaryOp->IFM(1) ? primaryOp->IFM(1)->shape.ToString() : "", primaryOp->PrimaryIfmIndex()); + } + write++; + } + + // Shorten list to contain only those operators written + _schedList.erase(write, cur); +} + +// Reorder CPU ops so that there are fewer groups of consecutive CPU ops in the list of ops +void SchedulerPacking::ReorderOperations() +{ + // Graphs with both CPU and NPU ops might not have an optimal order in the ops list due to how the graph is + // traversed (depth first search). This can result in more context switching between CPU and NPU. 
Try to optimise + // this by moving/grouping CPU ops where that is possible. Criteria for CPU pass to be moved: + // + // 1) CPU passes that only consumes graph input tensors can be moved to the top of the list. + // + // 2) CPU passes that only produces graph output tensors can be moved to the bottom of the list. + // + // 3) A CPU pass X is allowed to be grouped together with CPU pass Y if there is no NPU pass between pass X and pass + // Y that depends on output from pass X. Criteria 3 will try to move as many CPU passes towards the bottom of the + // list. + + // Ops with only graph input IFMs + std::vector> earlyOps; + + // Ops with only graph output OFMs + std::vector> lateOps; + + // Ops not in the above two lists + std::vector> otherOps; + + // Reserving space since most ops are likely to end up here + otherOps.reserve(_schedList.size()); + + // Iterate in execution order to find CPU ops with only graph input IFMs or only graph output OFMs + for ( auto i = _schedList.begin(); i != _schedList.end(); ++i ) + { + std::unique_ptr &op = *i; + + if ( !op->IsNpuOp() && AllInputsAreGraphInputs(*op) ) + { + earlyOps.push_back(std::move(*i)); + } + else if ( !op->IsNpuOp() && AllOutputsAreGraphOutputs(*op) ) + { + lateOps.push_back(std::move(*i)); + } + else + { + otherOps.push_back(std::move(*i)); + } + } + + // Iterate in reverse execution order to find CPU ops + for ( auto i = otherOps.rbegin(); i != otherOps.rend(); ++i ) + { + std::unique_ptr &op = *i; + + // We're looking for CPU ops + if ( op->IsNpuOp() ) continue; + + // Iterate in execution order from the CPU op's position + for ( auto j = i; j != otherOps.rbegin(); --j ) + { + std::unique_ptr &op0 = *(j - 0); // Earlier in execution order + std::unique_ptr &op1 = *(j - 1); // Later in execution order + assert(!op0->IsNpuOp()); + + // Don't move past another CPU op + if ( !op1->IsNpuOp() ) break; + + // If our CPU op and the op after are connected, we can't move it down + if ( IsConnected(*op0, *op1) ) 
break; + + // Move our CPU op one step later in execution order + std::iter_swap(j, j - 1); + } + } + + // Reassemble the list + _schedList.clear(); + _schedList.reserve(earlyOps.size() + otherOps.size() + lateOps.size()); + _schedList.insert(_schedList.end(), std::make_move_iterator(earlyOps.begin()), std::make_move_iterator(earlyOps.end())); + _schedList.insert(_schedList.end(), std::make_move_iterator(otherOps.begin()), std::make_move_iterator(otherOps.end())); + _schedList.insert(_schedList.end(), std::make_move_iterator(lateOps.begin()), std::make_move_iterator(lateOps.end())); + + // Recalculate the op index now when the list may have a different order + for ( auto i = _schedList.begin(); i != _schedList.end(); ++i ) + { + (*i)->_index = int(std::distance(_schedList.begin(), i)); + } +} + +int SchedulerPacking::CanPack(const SchedulerOperation *schedOp, const SchedulerOperation *prevOp, + const SchedulerOperation *nextOp, const int prevOpKey) const +{ + const auto prevConnOfm = prevOp->OFM(); + const auto nextConnIfm = nextOp->IFM(0); + const auto nextConnIfm2 = nextOp->TryIFM(1); + const auto nextConnOfm = nextOp->OFM(); + + SchedulerTensor *prevOFM = prevConnOfm->tensor.get(); + SchedulerTensor *ifmTensor = nextConnIfm->tensor.get(); + SchedulerTensor *ifm2Tensor = nextConnIfm2 ? nextConnIfm2->tensor.get() : nullptr; + assert(prevOFM && "primary/prev op must have OFM"); + assert(ifmTensor && "next op must have IFM"); + + // Previous op in execution order doesn't connect to this one + if ( prevOFM != ifmTensor && prevOFM != ifm2Tensor ) + { + return 0; + } + + // Highly unlikely constant tensor between ops + assert(!prevOFM->srcTensor->IsConstant() && "Unexpected constant tensor between ops"); + + // Only pack tensors on single-reader/writer paths (i.e. 
can't pack across concat/split) + if ( prevOFM->producers.size() != 1 || prevOFM->consumers.size() != 1 ) + { + return 0; + } + + if ( schedOp->OFM()->tensor->isGraphOutput ) + { + return 0; + } + + if ( IsActivation(nextOp->Type()) && !nextConnIfm->quantization.EqualScales(nextConnOfm->quantization) ) + { + // Can not fuse activation with different scales + return 0; + } + + ArchitectureOpGroupQuery op1{}; + op1.type = nextOp->Type(); + op1.kernel = nextOp->Kernel(); + op1.ifm.key = nextConnIfm->tensor->uid; + op1.ifm.type = nextConnIfm->tensor->dataType; + if ( nextConnIfm2 ) + { + op1.ifm2.key = nextConnIfm2->tensor->uid; + op1.ifm2.type = nextConnIfm2->tensor->dataType; + } + op1.ofm.key = nextConnOfm->tensor->uid; + op1.ofm.type = nextConnOfm->tensor->dataType; + + return schedOp->_opGroup->Add(op1, {prevOpKey}); +} + +void SchedulerPacking::InitSchedulerConnection( + SchedulerConnection *schedConn, const std::shared_ptr &tensor, const TensorConnection &conn) +{ + schedConn->tensor = tensor; + schedConn->slice = conn.slice; + schedConn->shape = Shape::PadAxes(conn.shape, 3, 1); // Scheduler needs minimum HWC axes to stripe + schedConn->quantization = conn.quantization; + schedConn->transpose = conn.transpose; + schedConn->reverse = conn.reverse; +} + +void SchedulerPacking::InitSchedulerTensor(SchedulerTensor *schedTensor, Tensor *tensor, const Graph *graph) +{ + // Take scheduler-local copies of graph tensor parameters. + schedTensor->format = TensorFormat::NHWC; + schedTensor->memArea = tensor->IsConstant() ? 
_arch->ReadonlyMemory() : _arch->FeatureMapMemory(); + schedTensor->storageShape = Shape::PadAxes(tensor->StorageShape(), 4, 1); + schedTensor->dataType = tensor->Type(); + schedTensor->bufferView = tensor->View(); + schedTensor->isGraphInput = graph->IsInput(tensor); + schedTensor->isGraphOutput = graph->IsOutput(tensor); + schedTensor->uid = tensor->Uid(); +} + +std::unique_ptr SchedulerPacking::MakeSchedulerOperation(Operation *op, const Graph *graph) +{ + assert(op->Type() != OpType::None); + + std::unique_ptr schedOp = std::make_unique(op->Type()); + + schedOp->SetKernel(op->Kernel()); + schedOp->SetHasScaling(op->HasScaling()); + schedOp->SetRounding(op->Rounding()); + schedOp->SetParameters(op->Parameters()); + schedOp->SetAttributes(op->attr); + schedOp->_attr = op->AttributeRef(); + schedOp->_srcKey = op; + + // Get the inputs from the source op and connect with scheduler specific tensor + for ( const auto *list : {&op->Inputs(), &op->Outputs()} ) + { + for ( const auto &item : list->pairs() ) + { + Tensor *tensor = item.second.tensor.get(); + + // Get/update scheduler's metadata for the graph tensor. + auto pos = _tensorMap.find(tensor); + if ( pos == _tensorMap.end() ) + { + // Create new scheduler tensor if metadata is missing. + auto tmp = std::make_shared(); + pos = _tensorMap.emplace(tensor, tmp).first; + tmp->srcTensor = item.second.tensor; + InitSchedulerTensor(tmp.get(), tensor, graph); + } + + // Update consumers and manage connectivity + const std::shared_ptr &schedTensor = pos->second; + + if ( IsOFM(item.first) ) + { + schedTensor->producers.push_back(schedOp.get()); + } + else + { + schedTensor->consumers.push_back(schedOp.get()); + } + SchedulerConnection *schedConn = IsOFM(item.first) ? 
schedOp->AddOutput(item.first) : schedOp->AddInput(item.first); + InitSchedulerConnection(schedConn, schedTensor, item.second); + schedConn->resamplingMode = ResamplingMode(item.first, schedOp->Type()); + } + } + + // Examine elementwise and set a primary path for cascading. + if ( IsBinaryElementwise(op->Type()) ) + { + auto ifm0 = op->Input(TensorUsage::IFM0); + auto ifm1 = op->Input(TensorUsage::IFM1); + auto ofm = op->Output(TensorUsage::OFM); + assert(ifm0->shape.Size() > 0 && "IFM0 must have dimension"); + assert(ifm1->shape.Size() > 0 && "IFM1 must have dimension"); + // Choose the non-const IFM path for binary operations that have + // a constant input on the first IFM + if ( ifm0->tensor->IsConstant() && !ifm1->tensor->IsConstant() ) + { + schedOp->SetPrimaryIfmIndex(1); + } + // Favour the non-broadcast shape for cascading. + else if ( (ifm0->shape != ofm->shape) && (ifm1->shape == ofm->shape) ) + { + schedOp->SetPrimaryIfmIndex(1); + } + } + + return schedOp; +} + +std::vector> SchedulerPacking::DecomposeSchedulerOperation(std::unique_ptr op) +{ + std::vector> result; + switch ( op->Type() ) + { + case OpType::Conv2D: + result = DecomposeConv2D(_arch, std::move(op)); + break; + case OpType::DepthwiseConv2DBias: + result = DecomposeDepthwiseConv2D(_arch, std::move(op)); + break; + default: + assert(false); + break; + } + return result; +} + +ArchResampling SchedulerPacking::ResamplingMode(TensorUsage usage, OpType opType) const +{ + // only set resampling-mode on IFM-connections + if ( IsIFM(usage) ) + { + if ( opType == OpType::Conv2DBackpropInputSwitchedBias ) + { + return ArchResampling::Zeros; + } + } + return ArchResampling::None; +} + +} // namespace regor diff --git a/ethosu/regor/compiler/scheduler_packing.hpp b/ethosu/regor/compiler/scheduler_packing.hpp new file mode 100644 index 00000000..84123e4c --- /dev/null +++ b/ethosu/regor/compiler/scheduler_packing.hpp @@ -0,0 +1,70 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited 
and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "common/common.hpp" +#include "common/logging.hpp" + +#include "common/shape.hpp" +#include "graph.hpp" +#include "operation.hpp" +#include "scheduler_operation.hpp" +#include "tensor.hpp" + +#include +#include +#include +#include +#include +#include + +namespace regor +{ + +/// +/// Pack graph-level operations into flat/linear SchedulerOperation objects +/// (NOTE: Was pass_packing, but we skip the Pass to pack directly into SchedulerOperation) +/// +class SchedulerPacking +{ +protected: + Architecture *_arch = nullptr; + std::vector> _schedList; + std::unordered_map> _tensorMap; + +public: + SchedulerPacking(Architecture *arch); + +public: + std::vector> Process(const Graph *graph); + +private: + void FilterOperations(const std::vector &executionList, const Graph *graph); + void PackOperations(); + void ReorderOperations(); + + int CanPack(const SchedulerOperation *schedOp, const SchedulerOperation *prevOp, const SchedulerOperation *op, const int prevOpKey) const; + void InitSchedulerConnection(SchedulerConnection *schedConn, const std::shared_ptr &tensor, const TensorConnection &conn); + void InitSchedulerTensor(SchedulerTensor *schedTensor, Tensor *tensor, const Graph *graph); + std::unique_ptr MakeSchedulerOperation(Operation *op, const Graph *graph); + std::vector> DecomposeSchedulerOperation(std::unique_ptr op); + 
ArchResampling ResamplingMode(TensorUsage usage, OpType opType) const; +}; + +} // namespace regor diff --git a/ethosu/regor/compiler/softmax.cpp b/ethosu/regor/compiler/softmax.cpp new file mode 100644 index 00000000..313ff9d9 --- /dev/null +++ b/ethosu/regor/compiler/softmax.cpp @@ -0,0 +1,530 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "compiler/softmax.hpp" + +#include "common/numeric_util.hpp" +#include "common/scaling.hpp" +#include "operation.hpp" +#include "operation_util.hpp" + +#include +#include +#include + +namespace regor +{ + +/*** Exp LUT table for int16 Softmax */ +static const uint32_t EXP_LUT[] = { + // clang-format off + 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, + 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, + 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, + 0x00000002, 0x00000002, 0x00010002, 0x00000003, 0x00000003, 0x00000003, 0x00000003, 0x00000003, + 0x00000003, 0x00000003, 0x00000003, 0x00000003, 0x00000003, 0x00000003, 0x00000003, 0x00000003, + 0x00000003, 0x00000003, 0x00000003, 0x00010003, 0x00000004, 0x00000004, 0x00000004, 0x00000004, + 0x00000004, 0x00000004, 0x00000004, 0x00000004, 0x00000004, 0x00000004, 0x00000004, 0x00000004, + 
0x00010004, 0x00000005, 0x00000005, 0x00000005, 0x00000005, 0x00000005, 0x00000005, 0x00000005, + 0x00000005, 0x00000005, 0x00010005, 0x00000006, 0x00000006, 0x00000006, 0x00000006, 0x00000006, + 0x00000006, 0x00000006, 0x00010006, 0x00000007, 0x00000007, 0x00000007, 0x00000007, 0x00000007, + 0x00000007, 0x00000007, 0x00010007, 0x00000008, 0x00000008, 0x00000008, 0x00000008, 0x00000008, + 0x00010008, 0x00000009, 0x00000009, 0x00000009, 0x00000009, 0x00000009, 0x00010009, 0x0000000a, + 0x0000000a, 0x0000000a, 0x0000000a, 0x0001000a, 0x0000000b, 0x0000000b, 0x0000000b, 0x0000000b, + 0x0001000b, 0x0000000c, 0x0000000c, 0x0000000c, 0x0001000c, 0x0000000d, 0x0000000d, 0x0000000d, + 0x0001000d, 0x0000000e, 0x0000000e, 0x0000000e, 0x0001000e, 0x0000000f, 0x0000000f, 0x0001000f, + 0x00000010, 0x00000010, 0x00010010, 0x00000011, 0x00000011, 0x00010011, 0x00000012, 0x00000012, + 0x00010012, 0x00000013, 0x00000013, 0x00010013, 0x00000014, 0x00010014, 0x00000015, 0x00000015, + 0x00010015, 0x00000016, 0x00010016, 0x00000017, 0x00010017, 0x00000018, 0x00010018, 0x00000019, + 0x00010019, 0x0000001a, 0x0001001a, 0x0000001b, 0x0001001b, 0x0000001c, 0x0001001c, 0x0000001d, + 0x0001001d, 0x0000001e, 0x0001001e, 0x0001001f, 0x00000020, 0x00010020, 0x00010021, 0x00000022, + 0x00010022, 0x00010023, 0x00000024, 0x00010024, 0x00000025, 0x00010025, 0x00010026, 0x00010027, + 0x00000028, 0x00020028, 0x0000002a, 0x0001002a, 0x0001002b, 0x0001002c, 0x0000002d, 0x0001002d, + 0x0001002e, 0x0001002f, 0x00010030, 0x00010031, 0x00010032, 0x00010033, 0x00010034, 0x00010035, + 0x00010036, 0x00010037, 0x00010038, 0x00020039, 0x0001003b, 0x0000003c, 0x0002003c, 0x0001003e, + 0x0002003f, 0x00000041, 0x00020041, 0x00010043, 0x00010044, 0x00020045, 0x00020047, 0x00010049, + 0x0001004a, 0x0002004b, 0x0001004d, 0x0002004e, 0x00010050, 0x00020051, 0x00020053, 0x00010055, + 0x00020056, 0x00020058, 0x0002005a, 0x0001005c, 0x0002005d, 0x0002005f, 0x00020061, 0x00020063, + 0x00020065, 0x00020067, 0x00020069, 
0x0002006b, 0x0003006d, 0x00020070, 0x00020072, 0x00020074, + 0x00030076, 0x00020079, 0x0003007b, 0x0002007e, 0x00030080, 0x00020083, 0x00020085, 0x00040087, + 0x0002008b, 0x0003008d, 0x00030090, 0x00020093, 0x00030095, 0x00030098, 0x0003009b, 0x0004009e, + 0x000300a2, 0x000300a5, 0x000300a8, 0x000300ab, 0x000400ae, 0x000300b2, 0x000400b5, 0x000400b9, + 0x000300bd, 0x000400c0, 0x000400c4, 0x000400c8, 0x000400cc, 0x000400d0, 0x000500d4, 0x000400d9, + 0x000400dd, 0x000500e1, 0x000400e6, 0x000500ea, 0x000400ef, 0x000500f3, 0x000500f8, 0x000500fd, + 0x00050102, 0x00050107, 0x0005010c, 0x00060111, 0x00050117, 0x0006011c, 0x00060122, 0x00060128, + 0x0006012e, 0x00060134, 0x0006013a, 0x00070140, 0x00060147, 0x0007014d, 0x00060154, 0x0007015a, + 0x00070161, 0x00060168, 0x0008016e, 0x00070176, 0x0008017d, 0x00080185, 0x0007018d, 0x00090194, + 0x0008019d, 0x000801a5, 0x000801ad, 0x000901b5, 0x000901be, 0x000901c7, 0x000901d0, 0x000901d9, + 0x000a01e2, 0x000901ec, 0x000a01f5, 0x000b01ff, 0x000a020a, 0x000b0214, 0x000a021f, 0x000b0229, + 0x000b0234, 0x000b023f, 0x000c024a, 0x000c0256, 0x000c0262, 0x000c026e, 0x000c027a, 0x000d0286, + 0x000d0293, 0x000d02a0, 0x000e02ad, 0x000e02bb, 0x000e02c9, 0x000e02d7, 0x000f02e5, 0x000f02f4, + 0x000f0303, 0x000f0312, 0x00100321, 0x00100331, 0x00110341, 0x00100352, 0x00120362, 0x00110374, + 0x00120385, 0x00120397, 0x001203a9, 0x001303bb, 0x001303ce, 0x001403e1, 0x001403f5, 0x00140409, + 0x0015041d, 0x00150432, 0x00160447, 0x0016045d, 0x00160473, 0x00170489, 0x001704a0, 0x001904b7, + 0x001804d0, 0x001904e8, 0x00190501, 0x001a051a, 0x001a0534, 0x001b054e, 0x001b0569, 0x001c0584, + 0x001c05a0, 0x001d05bc, 0x001e05d9, 0x001e05f7, 0x001e0615, 0x00200633, 0x00200653, 0x00200673, + 0x00210693, 0x002206b4, 0x002306d6, 0x002306f9, 0x0024071c, 0x00240740, 0x00260764, 0x0026078a, + 0x002607b0, 0x002807d6, 0x002907fe, 0x00290827, 0x002a0850, 0x002a087a, 0x002c08a4, 0x002c08d0, + 0x002e08fc, 0x002e092a, 0x002f0958, 0x00310987, 0x003109b8, 0x003209e9, 
0x00330a1b, 0x00340a4e, + 0x00350a82, 0x00350ab7, 0x00380aec, 0x00380b24, 0x003a0b5c, 0x003a0b96, 0x003c0bd0, 0x003d0c0c, + 0x003e0c49, 0x003f0c87, 0x00400cc6, 0x00420d06, 0x00430d48, 0x00440d8b, 0x00460dcf, 0x00480e15, + 0x00480e5d, 0x00490ea5, 0x004c0eee, 0x004d0f3a, 0x004e0f87, 0x00500fd5, 0x00511025, 0x00531076, + 0x005610c9, 0x0056111f, 0x00581175, 0x005a11cd, 0x005c1227, 0x005e1283, 0x005e12e1, 0x0061133f, + 0x006413a0, 0x00651404, 0x00671469, 0x006914d0, 0x006c1539, 0x006c15a5, 0x00701611, 0x00721681, + 0x007416f3, 0x00761767, 0x007917dd, 0x007a1856, 0x007d18d0, 0x0080194d, 0x008319cd, 0x00841a50, + 0x00881ad4, 0x00891b5c, 0x008d1be5, 0x00911c72, 0x00911d03, 0x00961d94, 0x00981e2a, 0x009c1ec2, + 0x009e1f5e, 0x00a21ffc, 0x00a4209e, 0x00a92142, 0x00ab21eb, 0x00ae2296, 0x00b22344, 0x00b523f6, + 0x00b924ab, 0x00be2564, 0x00c02622, 0x00c526e2, 0x00c827a7, 0x00cc286f, 0x00d0293b, 0x00d52a0b, + 0x00d72ae0, 0x00dd2bb7, 0x00e12c94, 0x00e62d75, 0x00eb2e5b, 0x00ef2f46, 0x00f23035, 0x00f83127, + 0x00fe321f, 0x0101331d, 0x0108341e, 0x010c3526, 0x01123632, 0x01173744, 0x011c385b, 0x01233977, + 0x01273a9a, 0x012e3bc1, 0x01343cef, 0x013a3e23, 0x01403f5d, 0x0146409d, 0x014c41e3, 0x0154432f, + 0x01594483, 0x016145dc, 0x0168473d, 0x016f48a5, 0x01764a14, 0x017d4b8a, 0x01854d07, 0x018d4e8c, + 0x01945019, 0x019d51ad, 0x01a4534a, 0x01ad54ee, 0x01b5569b, 0x01be5850, 0x01c75a0e, 0x01d05bd5, + 0x01d85da5, 0x01e35f7d, 0x01eb6160, 0x01f6634b, 0x01ff6541, 0x02096740, 0x02146949, 0x021e6b5d, + 0x02296d7b, 0x02336fa4, 0x023f71d7, 0x024a7416, 0x02567660, 0x026278b6, 0x026d7b18, 0x027a7d85 + // clang-format on +}; + +/*** 1/(1+X) LUT table for int16 Softmax */ +static const uint32_t ONE_OVER_ONE_PLUS_X_LUT[] = { + // clang-format off + 0xffc17fff, 0xffc07fc0, 0xffc27f80, 0xffc07f42, 0xffc17f02, 0xffc17ec3, 0xffc27e84, 0xffc27e46, + 0xffc27e08, 0xffc37dca, 0xffc27d8d, 0xffc37d4f, 0xffc37d12, 0xffc37cd5, 0xffc37c98, 0xffc47c5b, + 0xffc47c1f, 0xffc47be3, 0xffc57ba7, 0xffc57b6c, 0xffc37b31, 
0xffc67af4, 0xffc57aba, 0xffc67a7f, + 0xffc57a45, 0xffc67a0a, 0xffc779d0, 0xffc67997, 0xffc6795d, 0xffc77923, 0xffc778ea, 0xffc778b1, + 0xffc87878, 0xffc77840, 0xffc87807, 0xffc877cf, 0xffc97797, 0xffc87760, 0xffc97728, 0xffc976f1, + 0xffc976ba, 0xffc87683, 0xffca764b, 0xffca7615, 0xffca75df, 0xffca75a9, 0xffca7573, 0xffcb753d, + 0xffca7508, 0xffcb74d2, 0xffcb749d, 0xffca7468, 0xffcc7432, 0xffcc73fe, 0xffcb73ca, 0xffcc7395, + 0xffcd7361, 0xffcc732e, 0xffcc72fa, 0xffcd72c6, 0xffcd7293, 0xffcd7260, 0xffcc722d, 0xffce71f9, + 0xffcd71c7, 0xffce7194, 0xffce7162, 0xffce7130, 0xffcf70fe, 0xffce70cd, 0xffce709b, 0xffcf7069, + 0xffcf7038, 0xffcf7007, 0xffcf6fd6, 0xffcf6fa5, 0xffd06f74, 0xffd06f44, 0xffd06f14, 0xffd06ee4, + 0xffd06eb4, 0xffd06e84, 0xffd16e54, 0xffd16e25, 0xffd16df6, 0xffd16dc7, 0xffd06d98, 0xffd26d68, + 0xffd16d3a, 0xffd26d0b, 0xffd26cdd, 0xffd26caf, 0xffd26c81, 0xffd26c53, 0xffd36c25, 0xffd26bf8, + 0xffd36bca, 0xffd36b9d, 0xffd36b70, 0xffd26b43, 0xffd46b15, 0xffd36ae9, 0xffd46abc, 0xffd46a90, + 0xffd46a64, 0xffd46a38, 0xffd46a0c, 0xffd469e0, 0xffd469b4, 0xffd56988, 0xffd5695d, 0xffd56932, + 0xffd56907, 0xffd568dc, 0xffd568b1, 0xffd56886, 0xffd6685b, 0xffd56831, 0xffd66806, 0xffd667dc, + 0xffd667b2, 0xffd76788, 0xffd6675f, 0xffd76735, 0xffd6670c, 0xffd766e2, 0xffd666b9, 0xffd7668f, + 0xffd86666, 0xffd6663e, 0xffd86614, 0xffd765ec, 0xffd865c3, 0xffd8659b, 0xffd86573, 0xffd8654b, + 0xffd86523, 0xffd864fb, 0xffd964d3, 0xffd864ac, 0xffd96484, 0xffd8645d, 0xffd96435, 0xffd9640e, + 0xffd963e7, 0xffd963c0, 0xffd96399, 0xffda6372, 0xffd9634c, 0xffda6325, 0xffda62ff, 0xffda62d9, + 0xffda62b3, 0xffda628d, 0xffda6267, 0xffdb6241, 0xffda621c, 0xffdb61f6, 0xffda61d1, 0xffdc61ab, + 0xffd96187, 0xffdc6160, 0xffdb613c, 0xffdb6117, 0xffdb60f2, 0xffdc60cd, 0xffdc60a9, 0xffdb6085, + 0xffdc6060, 0xffdc603c, 0xffdc6018, 0xffdc5ff4, 0xffdc5fd0, 0xffdd5fac, 0xffdc5f89, 0xffdc5f65, + 0xffdd5f41, 0xffdd5f1e, 0xffdd5efb, 0xffdd5ed8, 0xffdd5eb5, 0xffdd5e92, 0xffdd5e6f, 0xffdd5e4c, + 
0xffdd5e29, 0xffde5e06, 0xffde5de4, 0xffdd5dc2, 0xffde5d9f, 0xffde5d7d, 0xffde5d5b, 0xffde5d39, + 0xffdf5d17, 0xffde5cf6, 0xffde5cd4, 0xffdf5cb2, 0xffdf5c91, 0xffde5c70, 0xffdf5c4e, 0xffdf5c2d, + 0xffde5c0c, 0xffe05bea, 0xffdf5bca, 0xffdf5ba9, 0xffdf5b88, 0xffdf5b67, 0xffe05b46, 0xffe05b26, + 0xffdf5b06, 0xffe05ae5, 0xffe05ac5, 0xffe05aa5, 0xffe05a85, 0xffe05a65, 0xffe05a45, 0xffe15a25, + 0xffe05a06, 0xffe059e6, 0xffe159c6, 0xffe159a7, 0xffe05988, 0xffe15968, 0xffe15949, 0xffe1592a, + 0xffe1590b, 0xffe158ec, 0xffe258cd, 0xffe158af, 0xffe15890, 0xffe25871, 0xffe15853, 0xffe25834, + 0xffe25816, 0xffe257f8, 0xffe157da, 0xffe257bb, 0xffe3579d, 0xffe25780, 0xffe25762, 0xffe25744, + 0xffe35726, 0xffe25709, 0xffe256eb, 0xffe356cd, 0xffe356b0, 0xffe35693, 0xffe25676, 0xffe35658, + 0xffe3563b, 0xffe3561e, 0xffe35601, 0xffe355e4, 0xffe455c7, 0xffe355ab, 0xffe4558e, 0xffe35572, + 0xffe45555, 0xffe35539, 0xffe4551c, 0xffe45500, 0xffe454e4, 0xffe454c8, 0xffe454ac, 0xffe45490, + 0xffe45474, 0xffe55458, 0xffe4543d, 0xffe45421, 0xffe55405, 0xffe553ea, 0xffe453cf, 0xffe553b3, + 0xffe45398, 0xffe5537c, 0xffe55361, 0xffe55346, 0xffe5532b, 0xffe55310, 0xffe552f5, 0xffe552da, + 0xffe652bf, 0xffe552a5, 0xffe5528a, 0xffe6526f, 0xffe55255, 0xffe6523a, 0xffe65220, 0xffe55206, + 0xffe651eb, 0xffe651d1, 0xffe651b7, 0xffe6519d, 0xffe65183, 0xffe65169, 0xffe7514f, 0xffe65136, + 0xffe6511c, 0xffe75102, 0xffe650e9, 0xffe750cf, 0xffe650b6, 0xffe7509c, 0xffe75083, 0xffe6506a, + 0xffe75050, 0xffe75037, 0xffe7501e, 0xffe75005, 0xffe74fec, 0xffe74fd3, 0xffe74fba, 0xffe74fa1, + 0xffe84f88, 0xffe74f70, 0xffe84f57, 0xffe74f3f, 0xffe84f26, 0xffe74f0e, 0xffe84ef5, 0xffe84edd, + 0xffe84ec5, 0xffe84ead, 0xffe74e95, 0xffe84e7c, 0xffe84e64, 0xffe94e4c, 0xffe84e35, 0xffe84e1d, + 0xffe84e05, 0xffe94ded, 0xffe84dd6, 0xffe84dbe, 0xffe94da6, 0xffe94d8f, 0xffe84d78, 0xffe84d60, + 0xffea4d48, 0xffe84d32, 0xffe94d1a, 0xffe94d03, 0xffe84cec, 0xffe94cd4, 0xffe94cbd, 0xffea4ca6, + 0xffe94c90, 0xffe84c79, 0xffea4c61, 
0xffe94c4b, 0xffe94c34, 0xffea4c1d, 0xffe94c07, 0xffea4bf0, + 0xffe94bda, 0xffea4bc3, 0xffea4bad, 0xffe94b97, 0xffea4b80, 0xffea4b6a, 0xffea4b54, 0xffea4b3e, + 0xffea4b28, 0xffea4b12, 0xffea4afc, 0xffea4ae6, 0xffea4ad0, 0xffeb4aba, 0xffea4aa5, 0xffea4a8f, + 0xffeb4a79, 0xffea4a64, 0xffea4a4e, 0xffeb4a38, 0xffeb4a23, 0xffea4a0e, 0xffeb49f8, 0xffea49e3, + 0xffeb49cd, 0xffeb49b8, 0xffeb49a3, 0xffeb498e, 0xffea4979, 0xffeb4963, 0xffeb494e, 0xffec4939, + 0xffeb4925, 0xffea4910, 0xffec48fa, 0xffeb48e6, 0xffeb48d1, 0xffec48bc, 0xffeb48a8, 0xffec4893, + 0xffeb487f, 0xffec486a, 0xffeb4856, 0xffec4841, 0xffec482d, 0xffeb4819, 0xffec4804, 0xffec47f0, + 0xffec47dc, 0xffec47c8, 0xffec47b4, 0xffec47a0, 0xffec478c, 0xffec4778, 0xffec4764, 0xffec4750, + 0xffec473c, 0xffed4728, 0xffec4715, 0xffec4701, 0xffed46ed, 0xffec46da, 0xffed46c6, 0xffec46b3, + 0xffec469f, 0xffed468b, 0xffed4678, 0xffec4665, 0xffed4651, 0xffed463e, 0xffed462b, 0xffec4618, + 0xffed4604, 0xffed45f1, 0xffed45de, 0xffed45cb, 0xffed45b8, 0xffed45a5, 0xffed4592, 0xffed457f, + 0xffee456c, 0xffed455a, 0xffed4547, 0xffed4534, 0xffee4521, 0xffed450f, 0xffed44fc, 0xffee44e9, + 0xffed44d7, 0xffee44c4, 0xffee44b2, 0xffed44a0, 0xffee448d, 0xffee447b, 0xffed4469, 0xffee4456, + 0xffee4444, 0xffee4432, 0xffee4420, 0xffee440e, 0xffee43fc, 0xffee43ea, 0xffee43d8, 0xffee43c6, + 0xffee43b4, 0xffee43a2, 0xffee4390, 0xffef437e, 0xffee436d, 0xffee435b, 0xffef4349, 0xffee4338, + 0xffee4326, 0xffef4314, 0xffee4303, 0xffef42f1, 0xffee42e0, 0xffef42ce, 0xffee42bd, 0xffef42ab, + 0xffef429a, 0xffee4289, 0xfff04277, 0xffee4267, 0xffef4255, 0xffef4244, 0xffef4233, 0xffef4222, + 0xffee4211, 0xffef41ff, 0xfff041ee, 0xffef41de, 0xffef41cd, 0xffee41bc, 0xfff041aa, 0xffef419a, + 0xffef4189, 0xffef4178, 0xfff04167, 0xffef4157, 0xffef4146, 0xfff04135, 0xffef4125, 0xfff04114, + 0xffef4104, 0xfff040f3, 0xffef40e3, 0xfff040d2, 0xfff040c2, 0xffef40b2, 0xfff040a1, 0xfff04091, + 0xfff04081, 0xffef4071, 0xfff04060, 0xfff04050, 0xfff04040, 0xfff04030, 
0xfff04020, 0xfff04010 + // clang-format on +}; + +Softmax::Softmax(Architecture *arch, OptimiserDatabase *db) : _arch(arch), _db(db) +{ + assert(_arch != nullptr); +} + +Operation *Softmax::ConvertOp(Operation *const operation) +{ + auto returnOp = operation; + + if ( OpType::Softmax == operation->Type() ) + { + auto ifmConn = operation->Input(TensorUsage::IFM0); + auto ofmConn = operation->Output(TensorUsage::OFM); + auto ifm = ifmConn->tensor.get(); + auto ofm = ofmConn->tensor.get(); + + if ( ifm->Type() == ofm->Type() || (ifm->Type() == DataType::Int8 && ofm->Type() == DataType::Int16) ) + { + // Reshape if needed + auto fullShape = Shape::PadAxes(ifmConn->shape, 4, 1); + if ( fullShape.Batch() > 1 ) + { + fullShape = fullShape.WithHeight(fullShape.Batch() * fullShape.Height()).WithBatch(1); + } + ifmConn->shape = fullShape; + ofmConn->shape = std::move(fullShape); + + if ( ifm->Type() == DataType::Int8 || ifm->Type() == DataType::UInt8 ) + { + returnOp = GetGraph8Bit(operation, ifmConn, ofmConn); + } + else if ( ifm->Type() == DataType::Int16 ) + { + returnOp = GetGraphInt16(operation, ifmConn, ofmConn); + } + } + if ( operation != returnOp ) + { + operation->Disconnect(); + } + } + + return returnOp; +} + + +void Softmax::RecordOptimisation(Operation *const operation, Operation *op) +{ + if ( _db ) + { + _db->AddOptimised(operation, op); + } +} + +Operation *Softmax::GetGraph8Bit(Operation *const operation, TensorConnection *ifmConn, TensorConnection *ofmConn) +{ + const auto &ifmQuant = ifmConn->quantization; + auto expTable = GenerateExpTable(double(operation->Parameters().softmax.beta), ifmQuant.scales[0].Dequantize()); + auto noScaleQuant = ifmConn->quantization; + noScaleQuant.scales.clear(); + auto noScaleQuantZp0 = noScaleQuant; + noScaleQuantZp0.zeroPoints[0] = 0; + auto oneScaleQuant = ifmConn->quantization; + oneScaleQuant.scales[0] = {1, 0}; + oneScaleQuant.zeroPoints[0] = 0; + auto twoScaleQuant = oneScaleQuant; + twoScaleQuant.scales[0] = {2, 
0}; + + // PASS 0 - Depthwise Maxpool + auto op = CreateDepthwiseMaxpool(ifmConn->tensor, ifmConn->shape, ifmConn->quantization, noScaleQuant); + op->SetRounding(RoundMode::DBL); + auto ifmMax = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // PASS 1 - Sub + auto subQuant = oneScaleQuant; + subQuant.zeroPoints[0] = 127; + op = CreateSub(ifmConn->tensor, ifmMax, ifmConn->quantization, noScaleQuant, subQuant, DataType::Int8, &ifmConn->shape); + op->SetRounding(RoundMode::DBL); + auto ifm_sub = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // PASS 1.5 - LUT(exp) + auto expLut = CreateConstTensor("exp_lut", DataType::Int32, std::make_shared(std::move(expTable))); + op = CreateLUT(ifm_sub, expLut, subQuant, subQuant); + auto ifm_exp = op->Output(TensorUsage::OFM)->tensor; + op->SetRounding(RoundMode::DBL); + RecordOptimisation(operation, op); + + // PASS 2 - ASR + auto right_shift12 = CreateConstTensor("right_shift12", 12); + op = CreateAsr(ifm_exp, right_shift12, subQuant, noScaleQuant, noScaleQuantZp0); + op->SetRounding(RoundMode::NATURAL); + op->attr.asr.round = true; + auto rescaled_exp = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // PASS 3 - Reduce sum + op = CreateReduceSum(rescaled_exp, noScaleQuantZp0, noScaleQuantZp0); + op->SetRounding(RoundMode::NATURAL); + auto sum_of_exp = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // PASS 4 - CLZ + op = CreateClz(sum_of_exp, noScaleQuantZp0, noScaleQuantZp0); + op->SetRounding(RoundMode::DBL); + auto headroom_plus_one = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // PASS 5 - Sub + auto headroom_offset = CreateConstTensor("headroom_offset", 12 + 31 - DataTypeSizeBits(ofmConn->tensor->Type())); + op = CreateSub(headroom_offset, headroom_plus_one, noScaleQuantZp0, noScaleQuantZp0, noScaleQuantZp0); + op->SetRounding(RoundMode::DBL); + auto right_shift = 
op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // PASS 6 - Sub + auto one = CreateConstTensor("one_const", 1); + op = CreateSub(headroom_plus_one, one, noScaleQuantZp0, noScaleQuant, noScaleQuantZp0); + op->SetRounding(RoundMode::DBL); + auto headroom = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // PASS 7 - SHL + op = CreateShl(sum_of_exp, headroom, noScaleQuantZp0, noScaleQuantZp0, oneScaleQuant); + op->SetRounding(RoundMode::DBL); + auto half_denominator = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // PASS 8 - Multiply + auto neg_32_over_17 = CreateConstTensor("neg_32_over_17", -int32_t((32ULL << 29U) / 17U)); + op = CreateMul(half_denominator, neg_32_over_17, oneScaleQuant, oneScaleQuant, twoScaleQuant); + op->SetRounding(RoundMode::DBL); + auto rescaled = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // PASS 9 - Add + auto const_48_over_17 = CreateConstTensor("const_48_over_17", int32_t((48ULL << 29U) / 17U)); + op = CreateAdd(rescaled, const_48_over_17, twoScaleQuant, noScaleQuant, oneScaleQuant); + op->SetRounding(RoundMode::DBL); + auto rescale_w_offset = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // PASS 10 - 24 + auto nr_x = std::move(rescale_w_offset); + auto F2_one = CreateConstTensor("F2_one", 1 << 29); + auto four = CreateConstTensor("four", 4); + for ( int i = 0; i < 3; ++i ) + { + // PASS 10, 15, 20 - MUL + op = CreateMul(nr_x, half_denominator, oneScaleQuant, oneScaleQuant, twoScaleQuant); + op->SetRounding(RoundMode::DBL); + auto half_denominator_times_x = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // PASS 11, 16, 21 - SUB + op = CreateSub(F2_one, half_denominator_times_x, noScaleQuant, twoScaleQuant, oneScaleQuant); + op->SetRounding(RoundMode::DBL); + auto one_minus_half_denominator_times_x = op->Output(TensorUsage::OFM)->tensor; + 
RecordOptimisation(operation, op); + + // PASS 12, 17, 22 - MUL + op = CreateMul(nr_x, one_minus_half_denominator_times_x, oneScaleQuant, oneScaleQuant, twoScaleQuant); + op->SetRounding(RoundMode::DBL); + auto to_rescale = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // PASS 13, 18, 23 - MUL + op = CreateMul(to_rescale, four, twoScaleQuant, noScaleQuant, noScaleQuantZp0); + op->SetRounding(RoundMode::DBL); + auto to_add = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // PASS 14, 19, 24 - ADD + op = CreateAdd(nr_x, to_add, oneScaleQuant, noScaleQuantZp0, oneScaleQuant); + op->SetRounding(RoundMode::DBL); + nr_x = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + } + + // PASS 25 - Multiply + op = CreateMul(ifm_exp, nr_x, oneScaleQuant, oneScaleQuant, oneScaleQuant); + op->SetRounding(RoundMode::DBL); + auto scaled_exp = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // PASS 26 - ASR + auto shrOp = std::make_shared(OpType::Asr); + op = shrOp.get(); + op->SetRounding(RoundMode::NATURAL); + op->attr.asr.round = true; + op->ConnectInput(TensorUsage::IFM, scaled_exp).Set(oneScaleQuant); + op->ConnectInput(TensorUsage::IFM1, right_shift).Set(noScaleQuantZp0); + op->ConnectOutput(TensorUsage::OFM, ofmConn->tensor).Set(ofmConn->quantization).Set(ofmConn->shape); + RecordOptimisation(operation, op); + + return op; +} + +Operation *Softmax::GetGraphInt16(Operation *const operation, TensorConnection *ifmConn, TensorConnection *ofmConn) +{ + auto noScaleQuant = ifmConn->quantization; + noScaleQuant.scales.clear(); + + // PASS 0 - Depthwise Maxpool + auto op = CreateDepthwiseMaxpool(ifmConn->tensor, ifmConn->shape, ifmConn->quantization, noScaleQuant); + op->SetRounding(RoundMode::NATURAL); + auto ifmMax = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // PASS 1 - Sub + op = CreateSub(ifmConn->tensor, ifmMax, 
ifmConn->quantization, noScaleQuant, ifmConn->quantization, DataType::Int32, + &ifmConn->shape); + op->SetRounding(RoundMode::DBL); + auto sub1_ofm = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // PASS 2 - Mul + double beta = double(operation->Parameters().softmax.beta); + double mul2_out_range = 10.0 / 65535.0; + auto quant = ElementwiseMulScale(ifmConn->quantization.scales[0].Dequantize(), beta, mul2_out_range); + auto scale_quant = ifmConn->quantization; + scale_quant.scales[0] = QuantizedScale(beta); + auto mul2_quant = ofmConn->quantization; + mul2_quant.scales[0] = QuantizedScale(mul2_out_range); + auto scale = CreateConstTensor("mul2_scale", quant.scale); + op = CreateMul(sub1_ofm, scale, ifmConn->quantization, scale_quant, mul2_quant); + op->SetRounding(RoundMode::DBL); + auto mul2_ofm = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // PASS 3 - Add + auto const_add = CreateConstTensor("add3_const", 32767); + op = CreateAdd(mul2_ofm, const_add, mul2_quant, noScaleQuant, mul2_quant, DataType::Int16); + op->SetRounding(RoundMode::DBL); + auto ifm_add = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // PASS 3.5 - LUT(exp) + auto expBuf = std::make_shared(int(std::size(EXP_LUT)), EXP_LUT, true); + auto expLut = CreateConstTensor("exp_lut", DataType::Int32, expBuf); + op = CreateLUT(ifm_add, expLut, mul2_quant, mul2_quant, DataType::Int16); + op->SetRounding(RoundMode::DBL); + auto ifm_exp = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // PASS 4 - Reduce sum + op = CreateReduceSum(ifm_exp, mul2_quant, noScaleQuant); + op->SetRounding(RoundMode::NATURAL); + auto sum_of_exp = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // PASS 5 - CLZ + op = CreateClz(sum_of_exp, noScaleQuant, noScaleQuant); + op->SetRounding(RoundMode::DBL); + auto headroom_plus_one = op->Output(TensorUsage::OFM)->tensor; + 
RecordOptimisation(operation, op); + + // PASS 6 - Sub + auto const_31 = CreateConstTensor("const_31", 31); + op = CreateSub(const_31, headroom_plus_one, noScaleQuant, noScaleQuant, noScaleQuant); + op->SetRounding(RoundMode::DBL); + auto reciprocal_right_shift = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // PASS 7 - SHL + auto one = CreateConstTensor("one_const", 1); + op = CreateShl(one, reciprocal_right_shift, noScaleQuant, noScaleQuant, noScaleQuant); + op->SetRounding(RoundMode::DBL); + auto constant_one = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // PASS 8 - Sub + op = CreateSub(sum_of_exp, constant_one, noScaleQuant, noScaleQuant, noScaleQuant); + op->SetRounding(RoundMode::DBL); + auto sum_of_exps_minus_one = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // # PASS 9 - SHL + op = CreateShl(sum_of_exps_minus_one, headroom_plus_one, noScaleQuant, noScaleQuant, noScaleQuant); + op->SetRounding(RoundMode::DBL); + auto shifted_sum_minus_one = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // PASS 10 - ASR + auto shift = CreateConstTensor("shift_const", 15); + op = CreateAsr(shifted_sum_minus_one, shift, noScaleQuant, noScaleQuant, noScaleQuant); + op->SetRounding(RoundMode::NATURAL); + op->attr.asr.round = true; + auto shifted_sum_minus_one_16 = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // PASS 11 - Sub + auto sub11_const = CreateConstTensor("sub11_const", 32768); + op = CreateSub(shifted_sum_minus_one_16, sub11_const, noScaleQuant, noScaleQuant, noScaleQuant, DataType::Int16); + op->SetRounding(RoundMode::DBL); + auto reciprocal_scale = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // PASS 11.5 - LUT(one over one plus x) + auto oneOverOnePlusXBuf = std::make_shared(int(std::size(ONE_OVER_ONE_PLUS_X_LUT)), ONE_OVER_ONE_PLUS_X_LUT, true); + auto 
oneOverOnePlusXLut = CreateConstTensor("one_over_one_plus_x_lut", DataType::Int32, oneOverOnePlusXBuf); + op = CreateLUT(reciprocal_scale, oneOverOnePlusXLut, noScaleQuant, noScaleQuant, DataType::Int16); + op->SetRounding(RoundMode::DBL); + reciprocal_scale = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // # PASS 12 - Multiply + op = CreateMul(ifm_exp, reciprocal_scale, noScaleQuant, noScaleQuant, noScaleQuant, DataType::Int32); + op->SetRounding(RoundMode::DBL); + auto mul_ofm = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // PASS 13 - ASR + auto shrOp = std::make_shared(OpType::Asr); + op = shrOp.get(); + op->SetRounding(RoundMode::NATURAL); + op->attr.asr.round = true; + op->ConnectInput(TensorUsage::IFM, mul_ofm).Set(noScaleQuant); + op->ConnectInput(TensorUsage::IFM1, reciprocal_right_shift).Set(noScaleQuant); + op->ConnectOutput(TensorUsage::OFM, ofmConn->tensor).Set(ofmConn->quantization).Set(ofmConn->shape); + RecordOptimisation(operation, op); + + return op; +} + +std::vector Softmax::GenerateExpTable(double beta, double inputScale) +{ + const int kTableSize = 256; + const int kIntegerBits = 5; + const int kSignedBits = 31; + std::vector expTable(kTableSize); + using FixedPoint = gemmlowp::FixedPoint; + + const double realBeta = std::min(beta * inputScale * (1 << (kSignedBits - kIntegerBits)), (1ll << kSignedBits) - 1.0); + const auto quant = QuantizedScale(realBeta); + const int leftShift = 31 - quant.shift; + const int diffMin = -int(std::floor(1.0 * ((1 << kIntegerBits) - 1) * (1 << (kSignedBits - kIntegerBits)) / (1U << leftShift))); + + for ( int x = 0; x < kTableSize; ++x ) + { + int inputDiff = x - 255; + if ( inputDiff >= diffMin ) + { + const int32_t inputDiffRescaled = gemmlowp::SaturatingRoundingDoublingHighMul( + ClampToType(inputDiff * (1LL << leftShift)), quant.scale); + expTable[x] = gemmlowp::exp_on_negative_values(FixedPoint::FromRaw(inputDiffRescaled)).raw(); + } + else 
+ { + expTable[x] = 0; + } + } + + return expTable; +} + +} // namespace regor diff --git a/ethosu/regor/compiler/softmax.hpp b/ethosu/regor/compiler/softmax.hpp new file mode 100644 index 00000000..88ecbb7e --- /dev/null +++ b/ethosu/regor/compiler/softmax.hpp @@ -0,0 +1,53 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "common/scaling.hpp" +#include "graph_optimiser.hpp" +#include "operation.hpp" +#include "operation_util.hpp" + +#include +#include +#include + +namespace regor +{ + +/// +/// TFLite Graph optimiser Softmax rewriter +/// +class Softmax +{ +private: + Architecture *_arch = nullptr; + OptimiserDatabase *_db = nullptr; + +public: + Softmax(Architecture *arch, OptimiserDatabase *db); + Operation *ConvertOp(Operation *const operation); + +private: + void RecordOptimisation(Operation *const operation, Operation *op); + Operation *GetGraph8Bit(Operation *const operation, TensorConnection *ifmConn, TensorConnection *ofmConn); + Operation *GetGraphInt16(Operation *const operation, TensorConnection *ifmConn, TensorConnection *ofmConn); + std::vector GenerateExpTable(double beta, double inputScale); +}; + +} // namespace regor diff --git a/ethosu/regor/compiler/tensor.cpp b/ethosu/regor/compiler/tensor.cpp new file mode 100644 index 00000000..1b2fc6f6 --- /dev/null +++ 
b/ethosu/regor/compiler/tensor.cpp @@ -0,0 +1,118 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "compiler/tensor.hpp" + +#include "common/common.hpp" + +#include "architecture/architecture.hpp" +#include "common/buffer_view.hpp" +#include "common/data_type.hpp" +#include "common/shape.hpp" + +#include +#include +#include +#include + +namespace regor +{ + +Tensor::Tensor(const std::string &name, DataType type) : _name(name), _type(type), _uid(GenerateUniqueId()) +{ +} + +Tensor::Tensor(const std::string &name, DataType type, Shape shape) : + _name(name), _type(type), _uid(GenerateUniqueId()), _storageShape(std::move(shape)) +{ +} + +Tensor::Tensor(const std::string &name, DataType type, Shape shape, const std::shared_ptr &buffer) : + _name(name), _type(type), _uid(GenerateUniqueId()), _storageShape(shape), _buffer(buffer) +{ + assert(DataTypeStorageSizeBytes(type, shape.Elements()) <= buffer->Size()); +} + +BufferView Tensor::View() const +{ + int elementBits = DataTypeSizeBits(_type) > 0 ? 
DataTypeSizeBits(_type) : 8; + return BufferView(_buffer, 0, elementBits, _storageShape, Shape()); +} + +bool Tensor::IsConstant() const +{ + return _buffer && _buffer->Size(); +} + +void Tensor::Reshape(const Shape &shape) +{ + assert(shape.Elements() == StorageShape().Elements()); + SetStorageShape(shape); +} +void Tensor::ChangeType(DataType newType) +{ + assert(!IsConstant()); + _type = newType; +} + +void Tensor::AddReader(std::shared_ptr reader) +{ + if ( std::find(_readers.begin(), _readers.end(), reader) == _readers.end() ) + { + _readers.push_back(reader); + } +} +void Tensor::AddWriter(std::shared_ptr writer) +{ + if ( std::find(_writers.begin(), _writers.end(), writer) == _writers.end() ) + { + _writers.push_back(writer); + } +} +void Tensor::RemoveReader(std::shared_ptr reader) +{ + _readers.erase(std::remove(_readers.begin(), _readers.end(), reader), _readers.end()); +} +void Tensor::RemoveWriter(std::shared_ptr writer) +{ + _writers.erase(std::remove(_writers.begin(), _writers.end(), writer), _writers.end()); +} +void Tensor::RemoveReaders() +{ + _readers.clear(); +} +void Tensor::RemoveWriters() +{ + _writers.clear(); +} + +std::unique_ptr Tensor::Clone() const +{ + auto clone = std::make_unique(*this); + clone->_uid = GenerateUniqueId(); + clone->RemoveReaders(); + clone->RemoveWriters(); + return clone; +} + +std::string Tensor::ToString() const +{ + return fmt::format("", Name(), StorageShape().ToString(), DataTypeToString(Type())); +} + +} // namespace regor diff --git a/ethosu/regor/compiler/tensor.hpp b/ethosu/regor/compiler/tensor.hpp new file mode 100644 index 00000000..f83b0009 --- /dev/null +++ b/ethosu/regor/compiler/tensor.hpp @@ -0,0 +1,95 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "common/common.hpp" + +#include "architecture/architecture.hpp" +#include "common/buffer_view.hpp" +#include "common/data_type.hpp" +#include "common/shape.hpp" +#include "include/graphapi.hpp" +#include "tensor_properties.hpp" + +#include +#include +#include + +namespace regor +{ + +class Operation; + +/// +/// Graph tensor representation +/// +class Tensor : public GraphApi::GraphTensor, public std::enable_shared_from_this +{ +private: + std::string _name; + DataType _type; + UniqueId _uid; + class Shape _storageShape; + std::shared_ptr _buffer; + enum AxisOrder _axisOrder = AxisOrder::Unknown; + const void *_passthrough = nullptr; // Original flatbuffer description of this tensor (if it was loaded from one) + + std::vector> _readers; + std::vector> _writers; + +public: + Tensor(const std::string &name, DataType type); + Tensor(const std::string &name, DataType type, Shape shape); + Tensor(const std::string &name, DataType type, Shape shape, const std::shared_ptr &buffer); + + const std::string &Name() const { return _name; } + void SetName(const std::string &name) { _name = name; } + DataType Type() const { return _type; } + UniqueId Uid() const { return _uid; } + + const Shape &StorageShape() const { return _storageShape; } + void SetStorageShape(const Shape &shape) { _storageShape = shape; } + void SetBuffer(const std::shared_ptr &buffer) { _buffer = buffer; } + + BufferView View() const; + bool IsConstant() const; + void Reshape(const Shape &shape); + void ChangeType(DataType newType); + + enum AxisOrder AxisOrder() 
const { return _axisOrder; } + void SetAxisOrder(enum AxisOrder axisOrder) { _axisOrder = axisOrder; } + + const void *Passthrough() const { return _passthrough; } + void SetPassthrough(const void *passthrough) { _passthrough = passthrough; } + + const std::vector> &Readers() const { return _readers; } + const std::vector> &Writers() const { return _writers; } + + void AddReader(std::shared_ptr reader); + void AddWriter(std::shared_ptr writer); + void RemoveReader(std::shared_ptr reader); + void RemoveWriter(std::shared_ptr writer); + void RemoveReaders(); + void RemoveWriters(); + + std::unique_ptr Clone() const; + std::string ToString() const; +}; + +} // namespace regor diff --git a/ethosu/regor/compiler/tensor_allocator.cpp b/ethosu/regor/compiler/tensor_allocator.cpp new file mode 100644 index 00000000..510d3619 --- /dev/null +++ b/ethosu/regor/compiler/tensor_allocator.cpp @@ -0,0 +1,149 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#include "tensor_allocator.hpp" + +#include "common/common.hpp" +#include "common/logging.hpp" + +#include "architecture/architecture.hpp" +#include "common/bit_flags.hpp" +#include "common/numeric_util.hpp" +#include "hillclimb_allocator.hpp" +#include "live_range.hpp" + +namespace regor +{ + +namespace +{ + +// Implementation of the linear allocator +Address LinearAllocateLiveRanges(LiveRangeGraph &lrGraph, int alignment) +{ + Address address = 0; + for ( const auto &lr : lrGraph.LiveRanges() ) + { + lr->SetAddress(address); + address += RoundAway(lr->size, alignment); + } + return address; +} + +void PrintAllocation(LiveRangeGraph &lrGraph, Address totalSize) +{ + LOG_PRINT("{0:10} - {1:10}: {2:>10} - {3:>10}: {4:11}: {5:12} : {6}\n", "Start Time", "End Time", "Start Addr", + "End Addr", "Tensor Size", "Memory Usage", "Name"); + auto lrs = lrGraph.LiveRanges(); + // Sort LiveRanges + std::sort(lrs.begin(), lrs.end(), + [](const std::shared_ptr &a, const std::shared_ptr &b) + { + if ( a->startTime != b->startTime ) return a->startTime < b->startTime; + return a->endTime < b->endTime; + }); + // Create memory histogram to track usage over time + std::vector memHist(lrGraph.EndTime(), 0); + for ( const auto &lr : lrs ) + { + for ( int t = lr->startTime; t <= lr->endTime; ++t ) + { + memHist[t] += lr->size; + } + } + + for ( const auto &lr : lrs ) + { + if ( lr->tensors.empty() ) + { + continue; + } + auto address = (*lr->tensors.begin())->allocatedAddress; + auto peakUsageDuringLiveRange = *std::max_element(memHist.begin() + lr->startTime, memHist.begin() + lr->endTime + 1); + for ( const auto &tens : lr->tensors ) + { + LOG_PRINT("{0:10} - {1:10}: {2:#10x} - {3:#10x}: {4:11}: {5:12} : {6}\n", lr->startTime, lr->endTime, + address, address + lr->size, lr->size, peakUsageDuringLiveRange, tens->Name()); + } + } + LOG_PRINT("Allocation Peak Tensor Size: {} bytes == {} KiB\n", totalSize, double(totalSize) / 1024.0); +} + +Address Allocate(LiveRangeGraph 
&lrGraph, const std::vector> &schedOps, + Schedule *schedule, const MemArea &targetMemory, TensorAllocator allocator, int alignment, Address sizeLimit) +{ + lrGraph.ExtractLiveRangesFromCascades(schedOps, schedule, targetMemory, false); + Address totalSize = 0; + if ( allocator == TensorAllocator::LinearAlloc ) + { + totalSize = LinearAllocateLiveRanges(lrGraph, alignment); + } + else if ( allocator == TensorAllocator::HillClimb ) + { + totalSize = HillClimbAllocateLiveRanges(lrGraph, alignment, sizeLimit); + } + return totalSize; +} + +} // namespace + +Address IncrementalLinearAllocator::Allocate(LiveRangeGraph *lrGraph, int alignment, bool verboseAllocation) +{ + for ( const auto &lr : lrGraph->LiveRanges() ) + { + if ( !lr->tensors.empty() ) + { + auto tensor = *(lr->tensors.begin()); + auto it = _allocatedAddresses.find(tensor->equivalenceId); + if ( it == _allocatedAddresses.end() ) + { + lr->SetAddress(_highestAddress); + _allocatedAddresses[tensor->equivalenceId] = _highestAddress; + _highestAddress += RoundAway(lr->size, alignment); + } + else + { + // An equivalent tensor has previously been allocated, reuse its address + lr->SetAddress(it->second); + } + } + } + if ( verboseAllocation ) + { + LOG_PRINT("{0:#^{1}}\n", "", 80); + LOG_PRINT("Tensor Allocation for {}:\n", _name); + PrintAllocation(*lrGraph, _highestAddress); + } + return _highestAddress; +} + +void AllocateTensors(const std::vector> &schedOps, Schedule *schedule, + const MemArea &memArea, TensorAllocator allocator, int alignment, bool verboseAllocation, Address sizeLimit) +{ + LiveRangeGraph lrGraph; + auto totalSize = Allocate(lrGraph, schedOps, schedule, memArea, allocator, alignment, sizeLimit); + if ( verboseAllocation ) + { + LOG_PRINT("{0:#^{1}}\n", "", 80); + LOG_PRINT("Allocation, memory {}, usage mask: {}\n", memArea.memory->Name(), memArea.usage.ToString()); + PrintAllocation(lrGraph, totalSize); + } + schedule->memoryUsage[memArea] = int(totalSize); +} + +} // namespace regor diff 
--git a/ethosu/regor/compiler/tensor_allocator.hpp b/ethosu/regor/compiler/tensor_allocator.hpp new file mode 100644 index 00000000..14a62f80 --- /dev/null +++ b/ethosu/regor/compiler/tensor_allocator.hpp @@ -0,0 +1,64 @@ +// +// SPDX-FileCopyrightText: Copyright 2021, 2023-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "architecture/architecture.hpp" +#include "scheduler.hpp" +#include "scheduler_operation.hpp" + +#include +#include + +namespace regor +{ + +class LiveRangeGraph; + +// Tensor allocation algorithms +enum class TensorAllocator : uint16_t +{ + // Allocator that does not reuse memory + LinearAlloc = 0, + // Search based allocator + HillClimb = 1, + Last, +}; + +/// +/// Linear allocator that can be used to allocate addresses across multiple subgraphs +/// +class IncrementalLinearAllocator +{ +public: + IncrementalLinearAllocator(const std::string &name) : _name(name) {} + Address Allocate(LiveRangeGraph *lrGraph, int alignment, bool verboseAllocation); + +private: + std::string _name; + // Map from tensor's equivalence id to allocated address + std::unordered_map _allocatedAddresses; + Address _highestAddress = 0; +}; + +// Allocates addresses to the tensors involved in the given operations/mem area(s) +// using the given tensor allocation algorithm. 
+void AllocateTensors(const std::vector> &schedOps, Schedule *schedule, const MemArea &memArea, + TensorAllocator allocator, int alignment, bool verboseAllocation, Address sizeLimit = std::numeric_limits
::max()); + +} // namespace regor diff --git a/ethosu/regor/compiler/tensor_properties.hpp b/ethosu/regor/compiler/tensor_properties.hpp new file mode 100644 index 00000000..fa45f5d3 --- /dev/null +++ b/ethosu/regor/compiler/tensor_properties.hpp @@ -0,0 +1,90 @@ +// +// SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "common/common.hpp" + +#include "include/graphapi.hpp" + +namespace regor +{ + +// Aliased by value (using/typedef doesn't work) +enum class AxisOrder : int16_t +{ + Unknown = int16_t(GraphApi::AxisOrder::Unknown), + OHWI = int16_t(GraphApi::AxisOrder::OHWI), + IHWO = int16_t(GraphApi::AxisOrder::IHWO), + OI = int16_t(GraphApi::AxisOrder::OI), +}; + +/// +/// Classification for how a Tensor is consumed by an operator. 
+/// +enum class TensorUsage : uint32_t +{ + None = 0, + IFM = 0x01, + OFM = 0x02, + Weights = 0x03, + Scales = 0x04, + Params = 0x05, + LUT = 0x06, + Commands = 0x07, + State = 0x08, + Last, + TypeMask = 0x0F, + IndexShift = 8, + IndexMask = 0xFFFFF00, + IFM0 = IFM, + IFM1 = 0x0100 | IFM, + IFM2 = 0x0200 | IFM, + Params0 = Params, + Params1 = 0x100 | Params, +}; + +DECLARE_ENUM_AS_FLAGS(TensorUsage) + +constexpr int MAX_NUM_IFM = 3; + +constexpr inline bool IsOFM(TensorUsage usage) +{ + return (usage & TensorUsage::TypeMask) == TensorUsage::OFM; +} + +constexpr inline bool IsIFM(TensorUsage usage) +{ + return (usage & TensorUsage::TypeMask) == TensorUsage::IFM; +} + +constexpr inline TensorUsage MakeTensorUsage(TensorUsage type, int index) +{ + return TensorUsage(uint32_t(type) | (index << 8)); +} + +constexpr inline int GetUsageIndex(TensorUsage usage) +{ + return unsigned(usage & TensorUsage::IndexMask) >> unsigned(TensorUsage::IndexShift); +} +constexpr inline TensorUsage GetUsageType(TensorUsage usage) +{ + return (usage & TensorUsage::TypeMask); +} + +} // namespace regor diff --git a/ethosu/regor/compiler/tflite_graph_optimiser.cpp b/ethosu/regor/compiler/tflite_graph_optimiser.cpp new file mode 100644 index 00000000..6dde3b4e --- /dev/null +++ b/ethosu/regor/compiler/tflite_graph_optimiser.cpp @@ -0,0 +1,2936 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "compiler/tflite_graph_optimiser.hpp" + +#include "common/logging.hpp" + +#include "architecture/architecture.hpp" +#include "common/reverse_type.hpp" +#include "common/scaling.hpp" +#include "common/transpose_type.hpp" +#include "graph.hpp" +#include "graph_optimiser.hpp" +#include "op_type.hpp" +#include "operation.hpp" +#include "optimiser_utils.hpp" +#include "softmax.hpp" +#include "tensor.hpp" +#include "tflite/tflite_schema_generated.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace regor +{ + +using namespace GraphOptimisation; + +// Is the scaling of Tensor connection a and b valid and equal. +bool TFLiteGraphOptimiser::IsScalingValidAndEqual(const TensorConnection &a, const TensorConnection &b) +{ + return (a.quantization.IsValid() && b.quantization.IsValid() && a.quantization.scales == b.quantization.scales && + a.quantization.zeroPoints == b.quantization.zeroPoints); +} + + +// Multiplies int with QuantizedScale with rounding. +int TFLiteGraphOptimiser::MultiplyByQuantizedMultiplier(int x, QuantizedScale quantScale) +{ + // Multiplies x (int32) by QuantizedScale (scale, shift), returns rounded result. + // Expects the QuantizedScale to be left-shift positive. + const int leftShift = quantScale.shift > 0 ? quantScale.shift : 0; + const int rightShift = quantScale.shift < 0 ? 
-quantScale.shift : 0; + const std::int32_t mul = gemmlowp::SaturatingRoundingDoublingHighMul(x * (1 << leftShift), quantScale.scale); + return gemmlowp::RoundingDivideByPOT(mul, rightShift); +} + +Operation *TFLiteGraphOptimiser::MakeMulWithConstTensor(const std::string &name, const TensorConnection &ifmConn, + const TensorConnection &ofmConn, const std::shared_ptr &constTens, const Quantization &quantization) +{ + auto ofm = ofmConn.tensor; + auto op = std::make_shared(OpType::Mul); + op->SetRounding(RoundMode::DBL); + + op->CopyInput(TensorUsage::IFM0, ifmConn); + op->ConnectInput(TensorUsage::IFM1, constTens).Set(quantization); + + auto ofmName = ofm->Name(); + ofmName.append("_"); + ofmName.append(name); + + std::shared_ptr cloneOfm = ofm->Clone(); + cloneOfm->SetName(ofmName); + op->ConnectOutput(TensorUsage::OFM, cloneOfm).Set(ofmConn.shape).Set(ofmConn.quantization).Set(ofmConn.slice); + + return op.get(); +} + +Operation *TFLiteGraphOptimiser::MakeOperation( + OpType opType, const TensorConnection *ifm0Conn, const TensorConnection *ifm1Conn, const TensorConnection *ofmConn) +{ + auto op = std::make_shared(opType); + assert(ifm0Conn != nullptr); + assert(ofmConn != nullptr); + op->CopyInput(TensorUsage::IFM0, *ifm0Conn); + op->CopyOutput(TensorUsage::OFM, *ofmConn); + if ( ifm1Conn != nullptr ) + { + op->CopyInput(TensorUsage::IFM1, *ifm1Conn); + } + op->SetRounding(RoundMode::DBL); + return op.get(); +} + +// Converts LeakyReLU to +// if alpha <= 1 +// Max(alpha * IFM, identity * IFM) +// else +// Min(alpha * IFM, identity * IFM) +Operation *TFLiteGraphOptimiser::ConvertLeakyRelu16bit(TensorConnection &ifmConn, TensorConnection &ofmConn, Operation *operation) +{ + Operation *returnOp = operation; + + auto ifm = ifmConn.tensor.get(); + auto ofm = ofmConn.tensor.get(); + float alpha = operation->Parameters().leaky_relu.alpha; + int scalar = 1; + + auto alphaQuant = ifmConn.quantization; + alphaQuant.quantMin = {0}; + alphaQuant.quantMax = {int64_t(alpha * 
IntegerMax(ifmConn.tensor->Type()))}; + alphaQuant.zeroPoints[0] = 0; + alphaQuant.scales[0] = QuantizedScale(alpha); + + if ( alpha < 0 ) + { + scalar = -1; + alphaQuant.scales[0].scale *= -1; + } + + // Multiply all values with alpha + auto fmAlpha = CreateConstTensor("lrelu_alpha", int16_t(scalar)); + auto alphaMulOp = MakeMulWithConstTensor("alpha", ifmConn, ofmConn, fmAlpha, alphaQuant); + + RecordOptimisation(operation, alphaMulOp); + TensorConnection *identityConn = &ifmConn; + + if ( !IsScalingValidAndEqual(ifmConn, ofmConn) ) + { + // Identity operation is introduced to handle rescaling of the IFM + auto identityQuant = ifmConn.quantization; + identityQuant.quantMin = {0}; + identityQuant.quantMax = {int64_t(IntegerMax(ifmConn.tensor->Type()))}; + identityQuant.zeroPoints[0] = 0; + identityQuant.scales[0] = {1, 0}; + + auto fmIdentity = CreateConstTensor("lrelu_ident", int16_t(1)); + + auto identityMulOp = MakeMulWithConstTensor("identity", ifmConn, ofmConn, fmIdentity, identityQuant); + RecordOptimisation(operation, identityMulOp); + identityConn = identityMulOp->Output(TensorUsage::OFM); + } + + // Merge scaled and unscaled values with a MIN or a MAX depending on alpha + if ( alpha <= 1 ) + { + // If alpha <= 1 + // Max(negative * alpha, negative) = negative * alpha + // Max(positive * alpha, positive) = positive + auto maxOp = MakeOperation(OpType::Maximum, alphaMulOp->Output(TensorUsage::OFM), identityConn, &ofmConn); + RecordOptimisation(operation, maxOp); + returnOp = maxOp; + } + else + { + // If alpha > 1: + // Min(negative * alpha, negative) = negative * alpha + // Min(positive * alpha, positive) = positive + auto minOp = MakeOperation(OpType::Minimum, alphaMulOp->Output(TensorUsage::OFM), identityConn, &ofmConn); + RecordOptimisation(operation, minOp); + returnOp = minOp; + } + + return returnOp; +} + + +// Get axis parameter for operator +int TFLiteGraphOptimiser::GetAxis(const Operation *const operation) +{ + auto opType = operation->Type(); + 
int axis = 0; + + switch ( opType ) + { + case OpType::Concat: + case OpType::ConcatTFLite: + axis = operation->Parameters().concat.axis; + break; + case OpType::Pack: + case OpType::Unpack: + axis = operation->Parameters().pack_unpack.axis; + break; + case OpType::Split: + { + auto *paramConn = operation->Input(TensorUsage::Params); + axis = paramConn->tensor->View().Values()[0]; + break; + } + case OpType::SplitV: + { + auto usage = MakeTensorUsage(TensorUsage::Params, 1); + auto *paramConn = operation->Input(usage); + axis = paramConn->tensor->View().Values()[0]; + break; + } + default: + break; + } + return axis; +} + + +// Calculate the read shape and offset values for Slice. +void TFLiteGraphOptimiser::SetSliceOffsetValues(Operation *const operation, Shape &readShape, Shape &readOffset) +{ + auto *beginConn = operation->Input(TensorUsage::Params0); + auto *sizeConn = operation->Input(TensorUsage::Params1); + + for ( auto idx = 0; idx < beginConn->tensor->View().ViewShape()[0]; idx++ ) + { + auto begin = beginConn->tensor->View().Values()[idx]; + auto size = sizeConn->tensor->View().Values()[idx]; + readOffset[idx] = begin; + readShape[idx] = size; + } + + readOffset = Shape::PadAxes(readOffset, 4, 0); + readShape = Shape::PadAxes(readShape, 4, 1); +} + + +// Calculate the read shape and offset values for StridedSlice. +void TFLiteGraphOptimiser::SetStridedSliceOffsetValues( + Operation *const operation, const TensorConnection *const ifmConn, Shape &readShape, Shape &readOffset) +{ + auto *beginConn = operation->Input(TensorUsage::Params0); + auto *endConn = operation->Input(TensorUsage::Params1); + + // strides tensor not used. 
+ auto beginMask = operation->Parameters().strided_slice.begin_mask; + auto endMask = operation->Parameters().strided_slice.end_mask; + + readShape = ifmConn->shape; + + for ( auto idx = 0; idx < ifmConn->shape.Size(); idx++ ) + { + // If the i:th bit in the mask is set then the value on offset_tens[i] should be ignored + if ( (beginMask & (1 << idx)) == 0 ) + { + readOffset[idx] = beginConn->tensor->View().Values()[idx]; + if ( readOffset[idx] < 0 ) + { + // Convert offset to positive value + readOffset[idx] += ifmConn->shape[idx]; + } + } + if ( (endMask & (1 << idx)) == 0 ) + { + readShape[idx] = endConn->tensor->View().Values()[idx]; + if ( readShape[idx] < 0 ) + { + // Convert offset to positive value + readShape[idx] += ifmConn->shape[idx]; + } + } + } + readOffset = Shape::PadAxes(readOffset, 4, 0); +} + + +// Creates MemoryCopy operation for the given ifm/ofm and write offset. +std::shared_ptr TFLiteGraphOptimiser::MakeMemoryCopyForConcat( + const TensorConnection *const ofmConn, const TensorConnection *const ifmConn, const Shape &writeOffset) +{ + auto op = std::make_shared(OpType::MemoryCopy); + op->SetRounding(RoundMode::NATURAL); + + op->CopyInput(TensorUsage::IFM0, *ifmConn); + op->ConnectOutput(TensorUsage::OFM, ofmConn->tensor) + .Set(ofmConn->shape) + .Set(ofmConn->quantization) + .Set({writeOffset, ifmConn->shape}); + + return op; +} + + +// Creates a MemoryCopy operation for the given ifm/ofm and readOffset. 
+std::shared_ptr TFLiteGraphOptimiser::MakeMemoryCopyForSplitOps(const TensorConnection *const ofmConn, + const TensorConnection *const ifmConn, const Shape &readShape, const Shape &readOffset) +{ + auto op = std::make_shared(OpType::MemoryCopy); + op->SetRounding(RoundMode::NATURAL); + op->ConnectInput(TensorUsage::IFM0, ifmConn->tensor).Set(ifmConn->shape).Set(ifmConn->quantization).Set({readOffset, readShape}); + op->CopyOutput(TensorUsage::OFM, *ofmConn); + + return op; +} + + +// Creates the desired shape of either: +// - Concat (Input shape - supply IFM base shape) +// - Split/SplitV (Output shape - supply OFM base shape) +// +// returns the Desired shape. +// Also calculates the axis4D, returned through supplied pointer. +Shape TFLiteGraphOptimiser::MakeConcatSplitDesiredShape(int axis, const Shape &baseShape, int *const axis4D) +{ + // Convert axis to positive. + if ( axis < 0 ) + { + axis += baseShape.Size(); + } + int to4D = (4 - baseShape.Size()); + *axis4D = axis + to4D; + return Shape::PadAxes(baseShape, 4, 1); +} + + +// Creates the desired shape of either: +// - pack (Input shape - supply IFM base shape) +// - unpack (Output shape - supply OFM base shape) +// +// returns the Desired shape. +// Unpack keeps the unpacked dimension set to 1. +// Also calculates the axis4D, returned through supplied pointer. +Shape TFLiteGraphOptimiser::MakePackUnpackDesiredShape(int axis, const Shape &baseShape, int *const axis4D) +{ + // Convert axis to positive. + if ( axis < 0 ) + { + axis += baseShape.Size() + 1; + } + Shape tmp = baseShape; + tmp = tmp.Insert(axis, 1); + int to4D = (4 - tmp.Size()); + *axis4D = axis + to4D; + return Shape::PadAxes(tmp, 4, 1); +} + + +// Creates the desired Output shape of StridedSlice. +// +// returns the Desired shape. 
+Shape TFLiteGraphOptimiser::MakeStridedSliceDesiredShape(Operation *const operation, const Shape &baseShape) +{ + auto newMask = unsigned(operation->Parameters().strided_slice.new_axis_mask); + auto shrinkMask = unsigned(operation->Parameters().strided_slice.shrink_axis_mask); + + if ( newMask == 0 && shrinkMask == 0 ) + { + return baseShape; + } + assert((newMask == 0) || (shrinkMask == 0)); + + Shape tmp = baseShape; + while ( shrinkMask ) + { + auto prevMask = shrinkMask; + shrinkMask &= shrinkMask - 1; + auto axis = 0; + auto diff = prevMask - shrinkMask; + diff >>= 1; + while ( diff ) + { + diff >>= 1; + ++axis; + } + tmp = tmp.Insert(axis, 1); + } + + while ( newMask ) + { + auto prevMask = newMask; + newMask &= newMask - 1; + auto axis = 0; + auto diff = prevMask - newMask; + diff >>= 1; + while ( diff ) + { + diff >>= 1; + ++axis; + } + tmp = tmp.Erase(axis); + newMask >>= 1; + } + + return Shape::PadAxes(tmp, 4, 1); +} + + +// Move Split/slice op to consumer +void TFLiteGraphOptimiser::MoveToConsumer(const Operation *const operation, Operation *const cons) +{ + auto *ifmConn = operation->Input(TensorUsage::IFM0); + auto *ofm = operation->OFM(); + auto *consIfm0 = cons->IFM(0); + auto *consIfm1 = cons->IFM(1); + + if ( consIfm0 == ofm ) + { + cons->CopyInput(TensorUsage::IFM0, *ifmConn); + } + else if ( consIfm1 != nullptr && IsBinaryElementwise(cons->Type()) && consIfm1 == ofm ) + { + cons->CopyInput(TensorUsage::IFM1, *ifmConn); + } +} + +void TFLiteGraphOptimiser::ReplaceOperation(Operation *const operationToReplace, Operation *const newOperation) +{ + auto oldOperation = operationToReplace->shared_from_this(); + + for ( const auto &input : oldOperation->Inputs().pairs() ) + { + newOperation->CopyInput(input.first, input.second); + } + for ( const auto &output : oldOperation->Outputs().pairs() ) + { + newOperation->CopyOutput(output.first, output.second); + } + oldOperation->Disconnect(); +} + +Operation *TFLiteGraphOptimiser::MakeDepthwiseMeanOp(const 
TensorConnection *ifmConn, const Shape &ifmShape4D, const Shape &readShape, + const Shape &readOffset, const Shape &ofmShape4D, int w, int h, const std::string &name, std::shared_ptr &weightTensor, + std::shared_ptr biasTensor, const Quantization &ifmQuant, const Quantization &weightQuant, const Quantization &ofmQuant) +{ + auto ifm = ifmConn->tensor; + auto op = std::make_shared(OpType::DepthwiseConv2DBias); + op->SetRounding(ifm->Type() == DataType::Int16 ? RoundMode::NATURAL : RoundMode::DBL); + op->SetKernel(std::make_unique(Point2i(w, h), Point2i(1, 1), Point2i(1, 1))); + + if ( weightTensor == nullptr ) + { + Shape weightShape(ifmShape4D.Batch(), h, w, ifmShape4D.Depth()); + std::vector ones(weightShape.Elements(), 1); + auto onesBuf = std::make_shared(std::move(ones)); + weightTensor = std::make_shared(name + "_weights", DataType::UInt8, weightShape, onesBuf); + weightTensor->SetAxisOrder(AxisOrder::IHWO); + } + + if ( biasTensor == nullptr ) + { + DataType biasType; + std::shared_ptr buf; + auto elems = ifmShape4D.Depth(); + if ( ifm->Type() == DataType::Int16 ) + { + biasType = DataType::Int64; + std::vector data(ToUnsigned(elems)); + buf = std::make_shared(std::move(data)); + } + else + { + biasType = DataType::Int32; + std::vector data(ToUnsigned(elems)); + buf = std::make_shared(std::move(data)); + } + biasTensor = std::make_shared(name + "bias", biasType, Shape(ifmShape4D.Depth()), buf); + } + + auto ifmQuantZp0 = ifmQuant; + ifmQuantZp0.zeroPoints.clear(); + ifmQuantZp0.zeroPoints.push_back(0); + op->ConnectInput(TensorUsage::IFM, ifm).Set(ifmShape4D).Set(ifmQuant).Set({readOffset, readShape}); + op->ConnectInput(TensorUsage::Weights, weightTensor).Set(weightQuant); + op->ConnectInput(TensorUsage::Scales, biasTensor).Set(ifmQuantZp0); + + auto ofm = std::make_shared(name + "_intermediate", DataType::Int32); + ofm->SetStorageShape(ofmShape4D); + op->ConnectOutput(TensorUsage::OFM, ofm).Set(ofmQuant); + + return op.get(); +} + + +// Upcast to int32 
+Operation *TFLiteGraphOptimiser::CreateCastToInt32(const TensorConnection *ifmConn) +{ + assert(ifmConn->tensor->Type() != DataType::Int32); + + auto noScaleQuantZp0 = ifmConn->quantization; + noScaleQuantZp0.scales.clear(); + noScaleQuantZp0.zeroPoints.clear(); + noScaleQuantZp0.zeroPoints.push_back(0); + + auto ofmShape4D = Shape::PadAxes(ifmConn->shape, 4, 1); + auto op = std::make_shared(OpType::MemoryCopy); + op->SetRounding(RoundMode::NATURAL); + op->CopyInput(TensorUsage::IFM0, *ifmConn); + auto ofm = std::make_shared(ifmConn->tensor->Name() + "_32bit", DataType::Int32); + ofm->SetStorageShape(ofmShape4D); + op->ConnectOutput(TensorUsage::OFM, ofm).Set(noScaleQuantZp0); + return op.get(); +} + + +// Converts op to int8/uint8 LUT which is generated with the given function. +Operation *TFLiteGraphOptimiser::ConvertToLUT8(Operation *op, std::function func, const std::string &name) +{ + auto ifmConn = op->Input(TensorUsage::IFM0); + auto ofmConn = op->Output(TensorUsage::OFM); + auto ifm = ifmConn->tensor; + auto ofm = ofmConn->tensor; + + if ( (ifm->Type() != DataType::Int8 && ifm->Type() != DataType::UInt8) || ifm->Type() != ofm->Type() ) + { + return op; + } + + // Generate LUT + double ifmScale(ifmConn->quantization.scales[0].Dequantize()); + double ofmScale(ofmConn->quantization.scales[0].Dequantize()); + auto zpIn = ifmConn->quantization.zeroPoints[0]; + auto zpOut = ofmConn->quantization.zeroPoints[0]; + int qMin = ifm->Type() == DataType::Int8 ? -128 : 0; + int qMax = ifm->Type() == DataType::Int8 ? 
127 : 255; + + std::vector lut; + lut.reserve(256); + for ( int x = qMin; x <= qMax; ++x ) + { + auto xReal = ifmScale * double(x - zpIn); + auto yReal = func(xReal); + int lutVal = int(RoundAwayZero(double(zpOut) + yReal / ofmScale)); + lutVal = std::min(qMax, std::max(qMin, lutVal)); + lut.push_back(uint8_t(lutVal)); + } + auto lutTens = CreateConstTensor(name, ifmConn->tensor->Type(), std::make_shared(std::move(lut))); + // The LUT must be applied without any preceding rescaling (the LUT itself performs the rescale), + // so even if the OFM has a different scale than the IFM, the generated OFM scale instructions + // should be the same as the IFM + auto returnOp = CreateLUT(ifmConn->tensor, lutTens, ifmConn->quantization, ifmConn->quantization, lutTens->Type(), + &ifmConn->shape, ofmConn->tensor, ifmConn->slice, ofmConn->slice); + returnOp->SetRounding(RoundMode::NATURAL); + return returnOp; +} + +// Converts op to int16 interpolating LUT which is generated with the given function. +Operation *TFLiteGraphOptimiser::ConvertToInterpolatingLUT16(Operation *op, std::function func, const std::string &name) +{ + auto ifmConn = op->Input(TensorUsage::IFM0); + auto ofmConn = op->Output(TensorUsage::OFM); + auto ifm = ifmConn->tensor; + auto ofm = ofmConn->tensor; + + if ( (ifm->Type() != DataType::Int16) || ifm->Type() != ofm->Type() ) + { + return op; + } + + double ifmScale(ifmConn->quantization.scales[0].Dequantize()); + double ofmScale(ofmConn->quantization.scales[0].Dequantize()); + auto zpIn = ifmConn->quantization.zeroPoints[0]; + auto zpOut = ofmConn->quantization.zeroPoints[0]; + double qMin = IntegerMin(DataType::Int16); + double qMax = IntegerMax(DataType::Int16); + double inputMin = ifmScale * (qMin - zpIn); + double inputMax = ifmScale * (qMax - zpIn); + double outputMin = ofmScale * (qMin - zpOut); + double outputMax = ofmScale * (qMax - zpOut); + const int steps = 512; + double step = (inputMax - inputMin) / steps; + double halfStep = step / 2.0; + double 
outputScalingInv = (qMax - qMin + 1) / (outputMax - outputMin); + + // Create 32-bit LUT represented by a 16-bit base and 16-bit slope. + auto lut = std::make_unique(512); + double prevLutResult = 0; + for ( int i = 0; i < steps; i++ ) + { + double val = func(inputMin + i * step); + double valMidpoint = func(inputMin + i * step + halfStep); + double valNext = func(inputMin + (i + 1) * step); + double sampleVal = RoundAwayZero(val * outputScalingInv); + + double midpointInterpVal = RoundAwayZero((valNext * outputScalingInv + sampleVal) / 2); + double midpointVal = RoundAwayZero(valMidpoint * outputScalingInv); + double midpointErr = midpointInterpVal - midpointVal; + double bias = RoundAwayZero(midpointErr / 2.0); + + double lutResult = std::clamp(sampleVal - bias, qMin, qMax); + + if ( i > 0 ) + { + uint32_t base = uint32_t(prevLutResult); + uint32_t slope = uint32_t(lutResult - prevLutResult); + lut[i - 1] = base + (slope << 16); + } + prevLutResult = lutResult; + } + double val = RoundAwayZero(func(inputMax) * outputScalingInv); + double lutResult = std::clamp(val, qMin, qMax); + uint32_t base = uint32_t(prevLutResult); + uint32_t slope = uint32_t(lutResult - prevLutResult); + lut[steps - 1] = base + (slope << 16); + + auto lutTens = CreateConstTensor(name, DataType::Int32, std::make_shared(std::move(lut), 512)); + // The LUT must be applied without any preceding rescaling (the LUT itself performs the rescale), + // so even if the OFM has a different scale than the IFM, the generated OFM scale instructions + // should be the same as the IFM + auto returnOp = CreateLUT(ifmConn->tensor, lutTens, ifmConn->quantization, ifmConn->quantization, lutTens->Type(), + &ifmConn->shape, ofmConn->tensor, ifmConn->slice, ofmConn->slice); + returnOp->SetRounding(RoundMode::NATURAL); + return returnOp; +} + +Operation *TFLiteGraphOptimiser::ConvertTanhSigmoidToLUT16(Operation *const op) +{ + auto ifmConn = op->Input(TensorUsage::IFM0); + auto ofmConn = 
op->Output(TensorUsage::OFM); + auto ifm = ifmConn->tensor; + auto ofm = ofmConn->tensor; + + if ( ifm->Type() != DataType::Int16 || ifm->Type() != ofm->Type() ) + { + return op; + } + + // clang-format off + // Table of sigmoid(i/24)*65536 + static const uint16_t SIGMOID_TABLE[256] = + { + 32768, 33451, 34133, 34813, 35493, 36169, 36843, 37513, + 38180, 38841, 39498, 40149, 40794, 41432, 42064, 42688, + 43304, 43912, 44511, 45102, 45683, 46255, 46817, 47369, + 47911, 48443, 48964, 49475, 49975, 50464, 50942, 51409, + 51865, 52311, 52745, 53169, 53581, 53983, 54374, 54755, + 55125, 55485, 55834, 56174, 56503, 56823, 57133, 57433, + 57724, 58007, 58280, 58544, 58800, 59048, 59288, 59519, + 59743, 59959, 60168, 60370, 60565, 60753, 60935, 61110, + 61279, 61441, 61599, 61750, 61896, 62036, 62172, 62302, + 62428, 62549, 62666, 62778, 62886, 62990, 63090, 63186, + 63279, 63368, 63454, 63536, 63615, 63691, 63765, 63835, + 63903, 63968, 64030, 64090, 64148, 64204, 64257, 64308, + 64357, 64405, 64450, 64494, 64536, 64576, 64614, 64652, + 64687, 64721, 64754, 64786, 64816, 64845, 64873, 64900, + 64926, 64950, 64974, 64997, 65019, 65039, 65060, 65079, + 65097, 65115, 65132, 65149, 65164, 65179, 65194, 65208, + 65221, 65234, 65246, 65258, 65269, 65280, 65291, 65301, + 65310, 65319, 65328, 65337, 65345, 65352, 65360, 65367, + 65374, 65381, 65387, 65393, 65399, 65404, 65410, 65415, + 65420, 65425, 65429, 65433, 65438, 65442, 65445, 65449, + 65453, 65456, 65459, 65462, 65465, 65468, 65471, 65474, + 65476, 65479, 65481, 65483, 65485, 65488, 65489, 65491, + 65493, 65495, 65497, 65498, 65500, 65501, 65503, 65504, + 65505, 65507, 65508, 65509, 65510, 65511, 65512, 65513, + 65514, 65515, 65516, 65517, 65517, 65518, 65519, 65520, + 65520, 65521, 65522, 65522, 65523, 65523, 65524, 65524, + 65525, 65525, 65526, 65526, 65526, 65527, 65527, 65528, + 65528, 65528, 65529, 65529, 65529, 65529, 65530, 65530, + 65530, 65530, 65531, 65531, 65531, 65531, 65531, 65532, + 65532, 65532, 65532, 
65532, 65532, 65533, 65533, 65533, + 65533, 65533, 65533, 65533, 65533, 65534, 65534, 65534, + 65534, 65534, 65534, 65534, 65534, 65534, 65534, 65535 + // clang-format on + }; + + auto lut = std::make_unique(512); + for ( int i = -256; i < 256; ++i ) + { + int j0, j1, v0, v1; + if ( i >= 0 ) + { + j0 = i; + j1 = i == 255 ? 255 : i + 1; + v0 = SIGMOID_TABLE[j0] - 0x8000; + v1 = SIGMOID_TABLE[j1] - 0x8000; + } + else + { + j0 = i == -256 ? 255 : -i; + if ( op->Type() == OpType::Sigmoid ) + { + j1 = j0 - 1; + } + else + { + j1 = i == -256 ? 255 : j0 - 1; + } + + v0 = 0x8000 - SIGMOID_TABLE[j0]; + v1 = 0x8000 - SIGMOID_TABLE[j1]; + } + + uint32_t base = v0 & 0xffff; + + uint32_t slope = 0; + if ( v1 - v0 > 0 ) slope = v1 - v0; + + lut[256 + i] = (slope << 16) | (base); + } + + auto lutTens = CreateConstTensor("LUT", ifmConn->tensor->Type(), std::make_shared(std::move(lut), 512)); + op->ConnectInput(TensorUsage::LUT, lutTens); + return op; +} + + +// Rewrite functions + +// Convert EXP operations to LUT +Operation *TFLiteGraphOptimiser::ConvertExpToLUT(Graph *const graph, Operation *const operation) +{ + UNUSED(graph); + Operation *returnOp = operation; + OpType type = operation->Type(); + if ( type != OpType::Exp ) + { + return returnOp; + } + const auto &ifmConn = operation->Input(TensorUsage::IFM0); + DataType ifmType = ifmConn->tensor->Type(); + if ( (ifmType & DataType::Bits8) == DataType::Bits8 ) + { + returnOp = ConvertToLUT8( + operation, [](double x) -> double { return std::exp(x); }, "Exp"); + RecordOptimisation(operation, returnOp); + operation->Disconnect(); + } + else if ( ifmType == DataType::Int16 ) + { + returnOp = ConvertToInterpolatingLUT16( + operation, [](double x) -> double { return std::exp(x); }, "Exp16(interp)"); + RecordOptimisation(operation, returnOp); + operation->Disconnect(); + } + return returnOp; +} + +Operation *TFLiteGraphOptimiser::RewriteConcat(Graph *const graph, Operation *const operation) +{ + UNUSED(graph); + auto *returnOp = 
operation; + auto opType = operation->Type(); + + if ( opType == OpType::Concat || opType == OpType::ConcatTFLite || opType == OpType::Pack ) + { + auto *ifm0Conn = operation->Input(TensorUsage::IFM0); + auto *ofmConn = operation->Output(TensorUsage::OFM); + auto *ofm = ofmConn->tensor.get(); + auto axis = GetAxis(operation); + auto axis4D = 0; + + // Remove writers from OFM + ofm->RemoveWriters(); + + // No unfuse of activation, should have been taken care of in TFLite reader. + // Pack treated like concat after setting desired shape + Shape packShape = Shape(); // Pack/Unpack calculates shape once outside loop. + if ( opType == OpType::Pack ) + { + packShape = MakePackUnpackDesiredShape(axis, ifm0Conn->shape, &axis4D); + } + + auto idx = 0; + auto usage = MakeTensorUsage(TensorUsage::IFM, 0); + auto *ifmConn = operation->Input(usage); + auto offset = 0; + // Set shape on all IFMs + while ( ifmConn != nullptr ) + { + Shape writeOffset(0, 0, 0, 0); + if ( opType == OpType::Pack ) + { + ifmConn->shape = packShape; + writeOffset[axis4D] = offset; + } + else if ( opType == OpType::Concat || opType == OpType::ConcatTFLite ) + { + ifmConn->shape = MakeConcatSplitDesiredShape(axis, ifmConn->shape, &axis4D); + writeOffset[axis4D] = offset; + } + + auto op = MakeMemoryCopyForConcat(ofmConn, ifmConn, writeOffset); + + offset += ifmConn->shape[axis4D]; + + ifmConn->tensor->RemoveReader(operation->shared_from_this()); + + usage = MakeTensorUsage(TensorUsage::IFM, ++idx); + ifmConn = operation->Input(usage); + + RecordOptimisation(operation, op.get()); + } + // Replaced by multiple ops. + // Will return the original op, which have all the Input/Outputs for the traversal. + // But with Writers and Readers cleared. 
+ } + return returnOp; +} + + +Operation *TFLiteGraphOptimiser::RewriteSplit(Graph *const graph, Operation *const operation) +{ + UNUSED(graph); + auto *returnOp = operation; + auto opType = operation->Type(); + + if ( opType == OpType::Split || opType == OpType::SplitV || opType == OpType::StridedSlice || + opType == OpType::Slice || opType == OpType::Unpack ) + { + auto *ifmConn = operation->Input(TensorUsage::IFM0); + assert(ifmConn); + auto *ofmConn = operation->Output(TensorUsage::OFM); + assert(ofmConn); + auto axis = GetAxis(operation); + auto axis4D = 0; + + if ( opType == OpType::StridedSlice ) + { + // StridedSlice ellipsis_mask not supported. + // StridedSlice new_axis_mask and shrink_axis_mask cannot both be set. + const auto ellipsis_mask = operation->Parameters().strided_slice.ellipsis_mask; + const auto new_axis_mask = operation->Parameters().strided_slice.new_axis_mask; + const auto shrink_axis_mask = operation->Parameters().strided_slice.shrink_axis_mask; + if ( ellipsis_mask != 0 || (new_axis_mask != 0 && shrink_axis_mask != 0) ) + { + return returnOp; + } + } + + // Only rewrite for int8, uint8 and int16 supported. + auto ifmType = ifmConn->tensor->Type(); + if ( ifmType != DataType::Int8 && ifmType != DataType::UInt8 && ifmType != DataType::Int16 ) + { + return returnOp; + } + + // Only rewrite for int8, uint8 and int16 supported. + auto ofmType = ofmConn->tensor->Type(); + if ( ofmType != DataType::Int8 && ofmType != DataType::UInt8 && ofmType != DataType::Int16 ) + { + return returnOp; + } + + Shape unpackShape = Shape(); // Pack/Unpack calculates shape once outside loop. 
+ if ( opType == OpType::Unpack ) + { + unpackShape = MakePackUnpackDesiredShape(axis, ofmConn->shape, &axis4D); + } + + auto idx = 0; + auto offset = 0; + auto usage = MakeTensorUsage(TensorUsage::OFM, 0); + ofmConn = operation->Output(usage); + // Set shape on all OFMs + while ( ofmConn != nullptr ) + { + // Remove writers from OFM + auto *ofm = ofmConn->tensor.get(); + ofm->RemoveWriters(); + + Shape readOffset(0, 0, 0, 0); + Shape readShape(1, 1, 1, 1); + + if ( opType == OpType::Unpack ) + { + ofmConn->shape = unpackShape; + readShape = unpackShape; + readOffset[axis4D] = offset; + } + else if ( opType == OpType::Split || opType == OpType::SplitV ) + { + ofmConn->shape = MakeConcatSplitDesiredShape(axis, ofmConn->shape, &axis4D); + readShape = ofmConn->shape; + readOffset[axis4D] = offset; + } + else if ( opType == OpType::Slice ) + { + ofmConn->shape = Shape::PadAxes(ofmConn->shape, 4, 1); + readShape = ifmConn->shape.WithOnes(); + readOffset = ifmConn->shape.WithZeros(); + SetSliceOffsetValues(operation, readShape, readOffset); + } + else if ( opType == OpType::StridedSlice ) + { + // TODO: MLBEDSW-9071: Change StridedSlice shape to 4D + ofmConn->shape = MakeStridedSliceDesiredShape(operation, ofmConn->shape); + readShape = ifmConn->shape.WithOnes(); + readOffset = ifmConn->shape.WithZeros(); + SetStridedSliceOffsetValues(operation, ifmConn, readShape, readOffset); + } + + auto op = MakeMemoryCopyForSplitOps(ofmConn, ifmConn, readShape, readOffset); + offset += ofmConn->shape[axis4D]; + + usage = MakeTensorUsage(TensorUsage::OFM, ++idx); + ofmConn = operation->Output(usage); + RecordOptimisation(operation, op.get()); + } + // Replaced by multiple ops. + // Will return the original op, which have all the Input/Outputs for the traversal. + // But with Writers and Readers cleared. 
+        ifmConn->tensor->RemoveReader(operation->shared_from_this());
+    }
+    return returnOp;
+}
+
+
+// Bypass (remove) a Reshape/Squeeze/ExpandDims op by rewiring its producers and
+// consumers so no data movement is required. When both the IFM and the OFM must
+// be preserved (both are graph boundary tensors) a MemoryCopy is inserted first.
+// Returns the op that replaces the reshape in traversal order (the copy op if
+// one was inserted, otherwise the original op).
+Operation *TFLiteGraphOptimiser::RemoveReshape(Graph *const graph, Operation *const operation)
+{
+    Operation *returnOp = operation;
+    OpType opType = operation->Type();
+
+    if ( IsReshape(opType) )
+    {
+        auto *ifmConn = operation->Input(TensorUsage::IFM0);
+        auto *ofmConn = operation->Output(TensorUsage::OFM);
+        auto *ifm = ifmConn->tensor.get();
+        auto *ofm = ofmConn->tensor.get();
+
+        // Check if ifm/ofm are network ifm/ofm
+        bool isIfmSgIfm = IsTensorInVector(graph->Inputs(), ifm);
+        bool isOfmSgOfm = IsTensorInVector(graph->Outputs(), ofm);
+        bool isIfmSgOfm = IsTensorInVector(graph->Outputs(), ifm);
+
+        // TODO: MLBEDSW-9069: Check CPU operator producer/consumer
+
+        // Inserts a copy op if needed before removing reshapes.
+        // Needed when both sides are boundary tensors: neither tensor may be
+        // eliminated, so the data must genuinely be duplicated.
+        if ( (isIfmSgIfm || isIfmSgOfm) && (isOfmSgOfm) )
+        {
+            auto copyOp = InsertCopyOpAfterTensor(ifmConn->tensor, ifmConn->quantization);
+            copyOp->SetRounding(RoundMode::NATURAL);
+
+            // reset the ifm to reflect the reshape's new ifm
+            ifmConn = operation->Input(TensorUsage::IFM0);
+            ifm = ifmConn->tensor.get();
+            returnOp = copyOp.get();
+            RecordOptimisation(operation, returnOp);
+            // Reshape still needs to be removed.
+        }
+
+        // Remove the reshape and one of the tensors.
+        if ( isOfmSgOfm )
+        {
+            // TODO: This path should also be used for ofm tensors consumed by CPU ops.
+
+            // The OFM is in graph outputs, do not remove this tensor.
+            // Bypass by replacing ifm with ofm.
+            // Set OFM as output for IFM producers
+            ReplaceProducerOutput(ifm->Writers(), ifm, ofmConn->tensor);
+
+            // Set OFM as input to other IFM consumers.
+            ReplaceConsumerInput(operation, ifm->Readers(), ifm, ofmConn->tensor);
+        }
+        else
+        {
+            // Bypass by replacing ofm with ifm.
+            // Set IFM as input to OFM consumers.
+            ReplaceConsumerInput(nullptr, ofm->Readers(), ofm, ifmConn->tensor);
+        }
+        // Remove the reshape from ifm readers and ofm writers.
+        // Note the Inputs/Outputs on operation should still be intact to not break the traversal.
+        ifm->RemoveReader(operation->shared_from_this());
+        ofm->RemoveWriter(operation->shared_from_this());
+    }
+
+    return returnOp;
+}
+
+// Remove a Transpose op by fusing its permutation into the producer op's OFM
+// connection when the producer supports it; otherwise insert a MemoryCopy that
+// carries the transpose. Non-constant or >4-element permutations are left alone.
+// Returns the op the transpose was fused into, or the original op if unchanged.
+Operation *TFLiteGraphOptimiser::RemoveTranspose(Graph *const graph, Operation *const operation)
+{
+    Operation *returnOp = operation;
+
+    OpType opType = operation->Type();
+
+    if ( opType == OpType::Transpose )
+    {
+        auto *ifmConn = operation->Input(TensorUsage::IFM0);
+        assert(ifmConn);
+        auto *paramsConn = operation->Input(TensorUsage::Params);
+        assert(paramsConn);
+        assert(paramsConn->shape.Size() == 1);
+        auto *ofmConn = operation->Output(TensorUsage::OFM);
+        assert(ofmConn);
+        auto *ifm = ifmConn->tensor.get();
+        auto *ofm = ofmConn->tensor.get();
+        Shape ifmShape = ifmConn->shape;
+        Shape ofmShape = ofmConn->shape;
+
+        // We can only handle permutation vectors up to 4 elements
+        if ( paramsConn->shape.Depth() > 4 ) return returnOp;
+
+        // We can only handle constant permutation vectors
+        if ( !paramsConn->tensor->IsConstant() ) return returnOp;
+
+        // Convert the permutation vector to a transpose mask that can transpose a shape of size 4
+        // For example:
+        // [0, 1, 2, 3] -> 0x0123 ("NHWC")
+        // [0, 1, 2] -> 0x0123 ("NHWC")
+        // [0, 1] -> 0x0123 ("NHWC")
+        // [0] -> 0x0123 ("NHWC")
+        // [0, 2, 1, 3] -> 0x0213 ("NWHC")
+        // [1, 0, 2] -> 0x0213 ("NWHC")
+        uint32_t mask = 0;
+        int offset = 4 - paramsConn->shape.Depth();
+        // Leading (padded) axes keep their identity position in the mask
+        for ( int i = 0; i < offset; i++ )
+        {
+            mask = (mask << 4) | i;
+        }
+        // Remaining nibbles come from the permutation values, shifted by the pad offset
+        // NOTE(review): template arguments appear stripped from this patch text
+        // (e.g. View().Values()) -- verify against the original change.
+        for ( int i = offset; i < 4; i++ )
+        {
+            mask = (mask << 4) | (offset + paramsConn->tensor->View().Values()[i - offset]);
+        }
+
+        // Convert the transpose mask to a transpose type
+        TransposeType transposeType = TransposeType(mask);
+
+        // Check if IFM/OFM are network IFM/OFM
+        bool isIfmSgIfm = IsTensorInVector(graph->Inputs(), ifm);
+        bool isIfmSgOfm = IsTensorInVector(graph->Outputs(), ifm);
+        bool ifmSingleReader = ifm->Readers().size() == 1;
+        bool ifmSingleWriter = ifm->Writers().size() == 1;
+
+        Operation *mainOp = nullptr;
+
+        // Fusing into the producer is only safe when the IFM is internal and
+        // exclusively connects the producer to this transpose.
+        if ( ifmSingleReader && ifmSingleWriter && !isIfmSgIfm && !isIfmSgOfm )
+        {
+            // Get the previous op and its current transpose type
+            Operation *prevOp = ifm->Writers()[0].get();
+            OpType prevOpType = prevOp->Type();
+            auto prevOfmConn = prevOp->Output(TensorUsage::OFM);
+
+            if ( IsNone(prevOfmConn->transpose) && prevOfmConn->reverse == ReverseType::None &&
+                 prevOfmConn->shape == ifmShape && _arch->SupportsTranspose(prevOpType, transposeType) )
+            {
+                // Set transpose type and shape on main op's OFM connection
+                prevOp->Output(TensorUsage::OFM)->shape = Shape::PadAxes(ofmShape, 4, 1);
+                prevOp->Output(TensorUsage::OFM)->transpose = transposeType;
+
+                // Previous op supports transpose -- Save it so we can fuse transpose to it
+                mainOp = prevOp;
+            }
+        }
+
+        if ( !mainOp && _arch->SupportsTranspose(OpType::MemoryCopy, transposeType) )
+        {
+            // Previous doesn't support transpose -- Add a MemoryCopy so we can fuse transpose to it
+            auto memoryCopy = InsertCopyOpAfterTensor(ifmConn->tensor, ifmConn->quantization);
+            memoryCopy->SetRounding(RoundMode::NATURAL);
+
+            // Set transpose type and shapes on main op's IFM/OFM connection
+            memoryCopy->Input(TensorUsage::IFM0)->shape = Shape::PadAxes(ifmShape, 4, 1);
+            memoryCopy->Output(TensorUsage::OFM)->shape = Shape::PadAxes(ofmShape, 4, 1);
+            memoryCopy->Output(TensorUsage::OFM)->transpose = transposeType;
+
+            // Since we added a new op and tensor before our transpose, update to new IFM
+            ifmConn = operation->Input(TensorUsage::IFM0);
+            ifm = ifmConn->tensor.get();
+
+            mainOp = memoryCopy.get();
+        }
+
+        if ( mainOp )
+        {
+            // Bypass and remove Transpose
+            ReplaceProducerOutput(ifm->Writers(), ifm, ofmConn->tensor);
+            ReplaceConsumerInput(operation, ifm->Readers(), ifm, ofmConn->tensor);
+            operation->Disconnect();
+
+            returnOp = mainOp;
+        }
+    }
+
+    return returnOp;
+}
+
+
+// Remove Reverse op and move its reverse type to the previous op's
+// OFM TensorConnection. If previous op can't reverse,
+// insert a MemoryCopy.
+Operation *TFLiteGraphOptimiser::RemoveReverse(Graph *const graph, Operation *const operation)
+{
+    auto returnOp = operation;
+
+    if ( operation->Type() == OpType::ReverseV2 )
+    {
+        auto ifmConn = operation->Input(TensorUsage::IFM);
+        auto paramsConn = operation->Input(TensorUsage::Params);
+        auto ofmConn = operation->Output(TensorUsage::OFM);
+
+        auto *ifm = ifmConn->tensor.get();
+        auto *ofm = ofmConn->tensor.get();
+        Shape ifmShape = ifmConn->shape;
+        Shape ofmShape = ofmConn->shape;
+
+        // We can only handle constant axis vectors
+        if ( !paramsConn->tensor->IsConstant() ) return returnOp;
+
+        // We can only handle 1-element axis vectors
+        if ( paramsConn->shape != Shape(1) ) return returnOp;
+
+        assert(paramsConn->tensor->Type() == DataType::Int32);
+        int32_t axis = paramsConn->tensor->View().Values()[0];
+
+        // Convert the axis parameter to a reverse type.
+        // For example:
+        // [axis = 0, size = 1, min_axis = 0, max_axis = 0] -> reverse type C (0x1)
+        // [axis = 0, size = 2, min_axis = 0, max_axis = 1] -> reverse type W (0x2)
+        // [axis = 1, size = 2, min_axis = 0, max_axis = 1] -> reverse type C (0x1)
+        // [axis = 0, size = 3, min_axis = 0, max_axis = 2] -> reverse type H (0x4)
+        // [axis = 1, size = 3, min_axis = 0, max_axis = 2] -> reverse type W (0x2)
+        // [axis = 2, size = 3, min_axis = 0, max_axis = 2] -> reverse type C (0x1)
+        // [axis = 1, size = 4, min_axis = 1, max_axis = 3] -> reverse type H (0x4)
+        // [axis = 2, size = 4, min_axis = 1, max_axis = 3] -> reverse type W (0x2)
+        // [axis = 3, size = 4, min_axis = 1, max_axis = 3] -> reverse type C (0x1)
+        const int size = ifmShape.Size();
+        if ( axis < 0 ) axis = size + axis;
+        const int axis_min = std::max(size - 3, 0);  // Can only reverse the last 3 dimensions
+        const int axis_max = size - 1;
+        if ( axis < axis_min || axis > axis_max ) return returnOp;
+        const ReverseType reverseType = ReverseType(1 << (axis_max - axis));
+
+        // Check if IFM/OFM are network IFM/OFM
+        bool isIfmSgIfm = IsTensorInVector(graph->Inputs(), ifm);
+        bool isIfmSgOfm = IsTensorInVector(graph->Outputs(), ifm);
+        bool ifmSingleReader = ifm->Readers().size() == 1;
+        bool ifmSingleWriter = ifm->Writers().size() == 1;
+
+        Operation *mainOp = nullptr;
+
+        // Fuse into the producer only when the IFM is internal and exclusively
+        // connects the producer to this ReverseV2.
+        if ( ifmSingleReader && ifmSingleWriter && !isIfmSgIfm && !isIfmSgOfm )
+        {
+            // Get the previous op and its current reverse type
+            Operation *prevOp = ifm->Writers()[0].get();
+            OpType prevOpType = prevOp->Type();
+            auto prevOfmConn = prevOp->Output(TensorUsage::OFM);
+
+            if ( prevOfmConn->reverse == ReverseType::None && IsNone(prevOfmConn->transpose) &&
+                 prevOfmConn->shape == ifmShape && _arch->SupportsReverse(prevOpType, reverseType) )
+            {
+                // Set reverse type and shape on main op's OFM connection
+                prevOp->Output(TensorUsage::OFM)->shape = Shape::PadAxes(ofmShape, 4, 1);
+                prevOp->Output(TensorUsage::OFM)->reverse = reverseType;
+
+                // Previous op supports reverse -- Save it so we can fuse reverse to it
+                mainOp = prevOp;
+            }
+        }
+
+        if ( !mainOp && _arch->SupportsReverse(OpType::MemoryCopy, reverseType) )
+        {
+            // Previous doesn't support reverse -- Add a MemoryCopy so we can fuse reverse to it
+            auto memoryCopy = InsertCopyOpAfterTensor(ifmConn->tensor, ifmConn->quantization);
+            memoryCopy->SetRounding(RoundMode::NATURAL);
+
+            // Set reverse type and shapes on main op's IFM/OFM connection
+            memoryCopy->Input(TensorUsage::IFM0)->shape = Shape::PadAxes(ifmShape, 4, 1);
+            memoryCopy->Output(TensorUsage::OFM)->shape = Shape::PadAxes(ofmShape, 4, 1);
+            memoryCopy->Output(TensorUsage::OFM)->reverse = reverseType;
+
+            // Since we added a new op and tensor before our reverse, update to new IFM
+            ifmConn = operation->Input(TensorUsage::IFM0);
+            ifm = ifmConn->tensor.get();
+
+            mainOp = memoryCopy.get();
+        }
+
+        if ( mainOp )
+        {
+            // Bypass and remove Reverse
+            ReplaceProducerOutput(ifm->Writers(), ifm, ofmConn->tensor);
+            ReplaceConsumerInput(operation, ifm->Readers(), ifm, ofmConn->tensor);
+            operation->Disconnect();
+
+            returnOp = mainOp;
+        }
+    }
+
+    return returnOp;
+}
+
+// Replace TFLite GatherV2 and GatherNd with GraphIR Gather, if possible.
+// The TFLite params/indices/output tensors are reshaped onto the GraphIR
+// Gather N/W/K/C layout; int16 indices are first cast to int32.
+Operation *TFLiteGraphOptimiser::ConvertGather(Graph *const graph, Operation *const operation)
+{
+    UNUSED(graph);
+
+    Operation *returnOp = operation;
+
+    OpType opType = operation->Type();
+
+    if ( opType == OpType::GatherV2 && _arch->SupportsGather(OpType::Gather) )
+    {
+        auto *paramsConn = operation->Input(TensorUsage::IFM0);
+        auto *idxConn = operation->Input(TensorUsage::IFM1);
+        auto *ofmConn = operation->Output(TensorUsage::OFM);
+        assert(paramsConn);
+        assert(idxConn);
+        assert(ofmConn);
+
+        auto paramsRank = paramsConn->shape.Size();
+        auto idxRank = idxConn->shape.Size();
+
+        // TFLite Gather attributes
+        int axisParam = 0;
+        int batchDimsParam = 0;
+        // NOTE(review): static_cast target type appears stripped from this patch
+        // text -- verify against the original change.
+        const tflite::Operator *const passthrough = static_cast(operation->Passthrough());
+        if ( passthrough )
+        {
+            const auto options = passthrough->builtin_options_as_GatherOptions();
+            if ( options )
+            {
+                axisParam = options->axis();
+                if ( axisParam < 0 ) axisParam = paramsRank - (-axisParam);
+                batchDimsParam = options->batch_dims();
+                // TODO: convert below asserts to TFLite semantic checks
+                assert(axisParam >= 0);
+                assert(axisParam < paramsRank);
+                assert(batchDimsParam >= 0);
+                assert(batchDimsParam < paramsRank);
+                assert(batchDimsParam < idxRank);
+                assert(batchDimsParam <= axisParam);
+            }
+        }
+
+        // Calculate GraphIR Gather N dim (product of the batch dims)
+        int N = 1;
+        for ( int i = 0; i < batchDimsParam; i++ )
+        {
+            N *= paramsConn->shape[i];
+        }
+
+        // Calculate GraphIR Gather W dim (product of the non-batch index dims)
+        int W = 1;
+        for ( int i = batchDimsParam; i < idxRank; i++ )
+        {
+            W *= idxConn->shape[i];
+        }
+
+        // Calculate GraphIR Gather K dim (the gathered axis)
+        int K = paramsConn->shape[axisParam];
+
+        // Calculate GraphIR Gather C dim (dims after the gathered axis)
+        int C = 1;
+        for ( int i = axisParam + 1; i < paramsRank; i++ )
+        {
+            C *= paramsConn->shape[i];
+        }
+
+        // Calculate the remaining dims (must be 1)
+        int S = 1;
+        for ( int i = batchDimsParam; i < axisParam; i++ )
+        {
+            S *= paramsConn->shape[i];
+        }
+
+        if ( S == 1 )
+        {
+            // Rebuild shapes
+            paramsConn->shape = Shape(1, N, K, C);
+            paramsConn->tensor->SetName("values");
+            idxConn->shape = Shape(1, 1, N, W);
+            idxConn->tensor->SetName("indices");
+            ofmConn->shape = Shape(1, N, W, C);
+            ofmConn->tensor->SetName("output");
+
+            if ( idxConn->tensor->Type() == DataType::Int16 )
+            {
+                // Create new op that casts indices to int32
+                auto idxCastOp = CreateCastToInt32(idxConn);
+
+                // Use the casted indices
+                auto idxCastConn = idxCastOp->Output(TensorUsage::OFM);
+                idxCastConn->shape = Shape(1, 1, N, W);
+                idxCastConn->tensor->SetName("indices-int32");
+                operation->CopyInput(TensorUsage::IFM1, *idxCastConn);
+            }
+
+            // Replace TFLite GatherV2 with GraphIR Gather
+            auto gatherOp = std::make_shared(OpType::Gather);
+            gatherOp->SetRounding(RoundMode::DBL);
+            ReplaceOperation(operation, gatherOp.get());
+            RecordOptimisation(operation, gatherOp.get());
+
+            returnOp = gatherOp.get();
+        }
+    }
+
+    return returnOp;
+}
+
+// Replace TFLite ScatterNd with GraphIR Scatter, if possible.
+Operation *TFLiteGraphOptimiser::ConvertScatter(Graph *const graph, Operation *const operation)
+{
+    UNUSED(graph);
+
+    Operation *returnOp = operation;
+
+    OpType opType = operation->Type();
+
+    if ( opType == OpType::ScatterNd && _arch->SupportsScatter(OpType::Scatter) )
+    {
+        auto *idxConn = operation->Input(TensorUsage::IFM0);
+        auto *updatesConn = operation->Input(TensorUsage::IFM1);
+        auto *shapeConn = operation->Input(TensorUsage::Params);
+        auto *ofmConn = operation->Output(TensorUsage::OFM);
+        assert(idxConn);
+        assert(updatesConn);
+        assert(shapeConn);
+        assert(ofmConn);
+
+        // Can only support this op when last dimension is 1
+        if ( idxConn->shape[-1] != 1 )
+        {
+            return returnOp;
+        }
+
+        // TODO: MLBEDSW-8459: Add supported ops check for TFLite ScatterND
+        assert(shapeConn->tensor->IsConstant());
+        assert(shapeConn->shape.Size() == 1);
+
+        // Calculate GraphIR Scatter N dim
+        int N = 1;
+
+        // Calculate GraphIR Scatter K dim (first element of the shape tensor)
+        // NOTE(review): template arguments appear stripped from this patch text
+        // (e.g. View().Values(), std::make_shared) -- verify against the original change.
+        int K = shapeConn->tensor->View().Values()[0];
+
+        // Calculate GraphIR Scatter W dim (product of all index dims but the last)
+        int W = 1;
+        for ( int i = 0; i < idxConn->shape.Size() - 1; i++ )
+        {
+            W *= idxConn->shape[i];
+        }
+
+        // Calculate GraphIR Scatter C dim (product of the remaining shape elements)
+        int C = 1;
+        for ( int i = 1; i < shapeConn->shape.Depth(); i++ )
+        {
+            C *= shapeConn->tensor->View().Values()[i];
+        }
+
+        // Reshape tensors to follow GraphIR Scatter convention
+        idxConn->shape = Shape(1, 1, N, W);
+        idxConn->tensor->SetName("indices");
+        updatesConn->shape = Shape(1, N, W, C);
+        updatesConn->tensor->SetName("input");
+        ofmConn->shape = Shape(1, N, K, C);
+        ofmConn->tensor->SetName("values_out");
+
+        // Generate a constant zeroed tensor as the GraphIR Scatter values_in tensor with same shape as values_out
+        auto dtype = ofmConn->tensor->Type();
+        std::vector zeroVector(DataTypeStorageSizeBytes(dtype, ofmConn->shape.Elements()), 0);
+        auto zeroBuffer = std::make_shared(std::move(zeroVector));
+        auto zeroTensor = CreateConstTensor("values_in", dtype, zeroBuffer, &ofmConn->shape);
+
+        // Add GraphIR Scatter op
+        auto scatterOp = std::make_shared(OpType::Scatter);
+        scatterOp->SetRounding(RoundMode::NATURAL);
+        scatterOp->ConnectInput(TensorUsage::IFM0, zeroTensor);  // GraphIR Scatter values_in
+        scatterOp->CopyInput(TensorUsage::IFM1, *idxConn);       // GraphIR Scatter indices
+        scatterOp->CopyInput(TensorUsage::IFM2, *updatesConn);   // GraphIR Scatter input
+        scatterOp->CopyOutput(TensorUsage::OFM, *ofmConn);       // GraphIR Scatter values_out
+
+        // Remove TFLite ScatterNd op
+        operation->Disconnect();
+        RecordOptimisation(operation, scatterOp.get());
+
+        returnOp = scatterOp.get();
+    }
+
+    return returnOp;
+}
+
+// Replace TFLite ResizeBilinear or ResizeNearestNeighbor with Resize
+// when the architecture supports the resulting scale/offset combination.
+// Scale factors are expressed as rational numbers (numerator/denominator)
+// reduced with gcd, with align-corners and half-pixel-centers adjustments
+// applied before the support query.
+Operation *TFLiteGraphOptimiser::ConvertResize(Graph *const graph, Operation *const operation)
+{
+    UNUSED(graph);
+    Operation *returnOp = operation;
+    OpType opType = operation->Type();
+
+    if ( opType == OpType::ResizeBilinear || opType == OpType::ResizeNearestNeighbor )
+    {
+        auto ifmConn = operation->Input(TensorUsage::IFM);
+        auto ofmConn = operation->Output(TensorUsage::OFM);
+        assert(ifmConn);
+        assert(ofmConn);
+
+        // Get numerators(n) and denominators(d) for the scale fractions
+        int width_n = ofmConn->shape.Width();
+        int width_d = ifmConn->shape.Width();
+        int height_n = ofmConn->shape.Height();
+        int height_d = ifmConn->shape.Height();
+        int heightOffset = 0;
+        int widthOffset = 0;
+
+        // Compute scaling fractions
+        // align-corners use a scale-factor of (n-1)/(d-1)
+        if ( operation->Parameters().resize.alignCorners )
+        {
+            if ( width_d > 1 )
+            {
+                width_n -= 1;
+                width_d -= 1;
+            }
+            if ( height_d > 1 )
+            {
+                height_n -= 1;
+                height_d -= 1;
+            }
+        }
+
+        // reduce scaling fractions with gcd
+        int gcd_w = std::gcd(width_n, width_d);
+        width_n = (width_n / gcd_w);
+        width_d = (width_d / gcd_w);
+
+        int gcd_h = std::gcd(height_n, height_d);
+        height_n = (height_n / gcd_h);
+        height_d = (height_d / gcd_h);
+
+        if ( operation->Parameters().resize.halfPixelCenters )
+        {
+            // make sure fractions are evenly divisible by 2
+            width_n = width_n * 2;
+            width_d = width_d * 2;
+            height_n = height_n * 2;
+            height_d = height_d * 2;
+            // adjust offset for half-pixel-centers
+            widthOffset = (width_d / 2) - (width_n / 2);
+            heightOffset = (height_d / 2) - (height_n / 2);
+        }
+
+        // set up op-support query
+        ResizeSupportQuery query;
+        query.scaleX = {int16_t(width_n), int16_t(width_d)};
+        query.scaleY = {int16_t(height_n), int16_t(height_d)};
+        query.offsetX = widthOffset;
+        query.offsetY = heightOffset;
+        query.ifmShape = ifmConn->shape;
+
+        if ( opType == OpType::ResizeBilinear )
+        {
+            query.mode = ArchResizeMode::Bilinear;
+        }
+        else
+        {
+            query.mode = ArchResizeMode::Nearest;
+        }
+        if ( _arch->SupportsResize(query) )
+        {
+            // Replace ResizeBilinear or ResizeNearestNeighbor with a Resize op
+            auto resizeOp = std::make_shared(OpType::Resize);
+            resizeOp->SetRounding(RoundMode::SYMMETRIC);
+            resizeOp->CopyInput(TensorUsage::IFM, *ifmConn);
+            resizeOp->CopyOutput(TensorUsage::OFM, *ofmConn);
+            resizeOp->Parameters() = operation->Parameters();
+
+            // write operator attributes
+            auto &attr = resizeOp->attr;
+            attr.resize.scaleX = {int16_t(width_n), int16_t(width_d)};
+            attr.resize.scaleY = {int16_t(height_n), int16_t(height_d)};
+            attr.resize.offsetYX[0] = heightOffset;
+            attr.resize.offsetYX[1] = widthOffset;
+            attr.resize.borderYX[0] = 0;
+            attr.resize.borderYX[1] = 0;
+            attr.resize.mode = tosa::ResizeMode::NEAREST;
+            if ( opType == OpType::ResizeBilinear )
+            {
+                attr.resize.mode = tosa::ResizeMode::BILINEAR;
+            }
+
+            int shift = 0;
+            if ( opType == OpType::ResizeBilinear && (ifmConn->shape.Width() > 1 || ifmConn->shape.Height() > 1) )
+            {
+                // ResizeBilinear is post-scaled with
+                // 1 / (height_n * width_n)
+                // as the scale-factor is a power of two, we can use shift
+                shift = IntLog2(width_n * height_n);
+            }
+
+            // Set explicit scaling
+            Quantization ofmQuant = ofmConn->quantization;
+            ofmQuant.scales.clear();
+            ofmQuant.zeroPoints.clear();
+            ofmQuant.scales.emplace_back(QuantizedScale(1, shift));
+            ofmQuant.zeroPoints.emplace_back(0);
+            ofmQuant.type = QuantizationType::EXPLICIT;
+            resizeOp->Output(TensorUsage::OFM)->Set(ofmQuant);
+
+            RecordOptimisation(operation, resizeOp.get());
+            returnOp = resizeOp.get();
+            operation->Disconnect();
+        }
+    }
+    return returnOp;
+}
+
+// Rewrite ArgMax into a kernel-based reduction along H or W: the reduced axis
+// is mapped into the kernel size, the OFM is given an explicit Q15 scale
+// (shift 16), and IFMs taller than 64K rows are split into kMaxSize slices.
+Operation *TFLiteGraphOptimiser::ConvertArgMax(Graph *const graph, Operation *const operation)
+{
+    UNUSED(graph);
+    Operation *returnOp = operation;
+    if ( operation->Type() == OpType::ArgMax && _arch->SupportsArgMax(OpType::ArgMax) )
+    {
+        if ( operation->Input(TensorUsage::IFM0)->slice.shape )
+        {
+            // Already processed, return
+            return returnOp;
+        }
+        constexpr int kMaxSize = 65536;
+        auto *ifmConn = operation->Input(TensorUsage::IFM0);
+        auto *paramsConn = operation->Input(TensorUsage::Params);
+        auto *ofmConn = operation->Output(TensorUsage::OFM);
+
+        auto axis = paramsConn->tensor->View().Values()[0];
+        axis = (axis == -1) ? ifmConn->shape.Size() - 1 : axis;
+        // Normalise the axis to a 4D (NHWC-padded) index
+        auto axis_4D = axis + (4 - ifmConn->shape.Size());
+
+        int ifmHeight = ifmConn->shape.Size() >= 3 ? ifmConn->shape.Height() : 1;
+        int ifmWidth = ifmConn->shape.Size() >= 2 ? ifmConn->shape.Width() : 1;
+
+        if ( axis_4D == 3 )
+        {
+            // ArgMax on Z-dimension, reshape to 1x(H*W)xCx1
+            ifmHeight = ifmHeight * ifmWidth;
+            ifmWidth = ifmConn->shape.Depth() > 0 ? ifmConn->shape.Depth() : 1;
+            ifmConn->shape = Shape(1, ifmHeight, ifmWidth, 1);
+            ofmConn->shape = Shape(1, ifmHeight, 1, 1);
+            axis_4D = 2;
+        }
+        operation->attr.axis.axis = axis_4D;
+        // Kernel spans the reduced axis so the whole axis is visited per output
+        int kernelH = axis_4D == 1 ? ifmHeight : 1;
+        int kernelW = axis_4D == 2 ? ifmWidth : 1;
+        std::unique_ptr kernel = std::make_unique(Point2i(kernelW, kernelH), Point2i(1, 1), Point2i(1, 1));
+        operation->SetKernel(std::move(kernel));
+        ofmConn->quantization.scales.clear();
+        ofmConn->quantization.scales.push_back(QuantizedScale(1, 16));
+        ofmConn->quantization.zeroPoints.clear();
+        ofmConn->quantization.zeroPoints.push_back(0);
+        operation->SetRounding(RoundMode::TRUNCATE_TO_LOWER);
+        if ( ifmConn->shape.Size() != ofmConn->shape.Size() )
+        {
+            // One dimension has been removed, reinsert a one in the removed axis dimension
+            ofmConn->shape.Insert(axis_4D, 1);
+        }
+
+        if ( ifmHeight > kMaxSize )
+        {
+            // Create splits of kMaxSize until not enough height left
+            TensorSlice ifmSlice = {Shape(0, 0, 0, 0), Shape(1, kMaxSize, ifmWidth, 1)};
+            ifmConn->Set(ifmSlice);
+            TensorSlice ofmSlice = {Shape(0, 0, 0, 0), Shape(1, kMaxSize, 1, 1)};
+            ofmConn->Set(ofmSlice);
+
+            int remainingHeight = ifmHeight - kMaxSize;
+            int offset = kMaxSize;
+            while ( remainingHeight > 0 )
+            {
+                int splitHeight = std::min(remainingHeight, kMaxSize);
+                remainingHeight -= splitHeight;
+
+                auto argmaxSplit = std::make_shared(OpType::ArgMax);
+                argmaxSplit->ConnectInput(TensorUsage::Params, paramsConn->tensor);
+
+                // Set slices to the size of each split
+                argmaxSplit->ConnectInput(TensorUsage::IFM0, ifmConn->tensor);
+                auto *splitIfmConn = argmaxSplit->Input(TensorUsage::IFM0);
+                ifmSlice = {Shape(0, offset, 0, 0), Shape(1, splitHeight, ifmWidth, 1)};
+                splitIfmConn->Set(ifmSlice);
+                splitIfmConn->shape = Shape(1, ifmHeight, ifmWidth, 1);
+
+                argmaxSplit->ConnectOutput(TensorUsage::OFM, ofmConn->tensor);
+                auto *splitOfmConn = argmaxSplit->Output(TensorUsage::OFM);
+                ofmSlice = {Shape(0, offset, 0, 0), Shape(1, splitHeight, 1, 1)};
+                splitOfmConn->Set(ofmSlice);
+
+                splitOfmConn->shape = Shape(1, ifmHeight, 1, 1);
+                splitOfmConn->quantization.scales.push_back(QuantizedScale(1, 16));
+                splitOfmConn->quantization.zeroPoints.push_back(0);
+                argmaxSplit->SetRounding(RoundMode::TRUNCATE_TO_LOWER);
+                argmaxSplit->attr.axis.axis = axis_4D;
+                argmaxSplit->SetKernel(std::make_unique(Point2i(kernelW, kernelH), Point2i(1, 1), Point2i(1, 1)));
+                offset += splitHeight;
+                RecordOptimisation(operation, argmaxSplit.get());
+            }
+        }
+    }
+    return returnOp;
+}
+
+// If a MemoryCopy reads a slice (split/strided input), try to push the slice
+// down onto its single consumer so the copy can later be removed.
+Operation *TFLiteGraphOptimiser::MoveSplitSliceToConsumer(Graph *const, Operation *const operation)
+{
+    auto *ifmConn = operation->Input(TensorUsage::IFM0);
+
+    if ( operation->Type() == OpType::MemoryCopy && ifmConn->slice.offset.Size() > 0 )
+    {
+        auto *ofmConn = operation->Output(TensorUsage::OFM);
+        auto *ofm = ofmConn->tensor.get();
+
+        // TODO: MLBEDSW-9072: Add check that moving split to consumer is valid
+
+        // We can only move to consumer if there is no transpose on the op that we will remove,
+        // otherwise we will lose that transposition.
+        if ( ofm->Readers().size() == 1 && IsNone(ofmConn->transpose) )
+        {
+            auto cons = ofm->Readers().front();
+            auto consOfmConn = cons->Output(TensorUsage::OFM);
+            auto *consIfm0 = cons->IFM(0);
+            auto *consIfm1 = cons->IFM(1);
+
+            bool ifmShapeEqual = false;
+            if ( consIfm0 == ofm )
+            {
+                // Check if ifm0 consumer has correct shape
+                auto *consIfm0Conn = cons->Input(TensorUsage::IFM0);
+                ifmShapeEqual = consIfm0Conn->shape == ofmConn->shape;
+            }
+            else if ( consIfm1 != nullptr && consIfm1 == ofm )
+            {
+                // Check if ifm1 consumer has correct shape
+                auto *consIfm1Conn = cons->Input(TensorUsage::IFM1);
+                ifmShapeEqual = consIfm1Conn->shape == ofmConn->shape;
+            }
+
+            // We can only move to consumer if there is no transpose on the op that we move to,
+            // otherwise the IFM shape may change and transposition will be wrong.
+            if ( !IsReshape(cons->Type()) && ofmConn->shape == Shape::PadAxes(ofm->StorageShape(), 4, 1) &&
+                 IsNone(consOfmConn->transpose) && ifmShapeEqual )
+            {
+                // Split/Slice can be performed by tensor consumer
+                MoveToConsumer(operation, cons.get());
+            }
+        }
+    }
+
+    return operation;
+}
+
+// Build a Transpose op that swaps the W and C axes of the given IFM (fixed
+// permutation [0,1,3,2]), producing a new OFM tensor with the given shape.
+// Used by RewriteBatchMatMul to honour the adj_x/adj_y attributes.
+// NOTE(review): returning op.get() from a local shared_ptr relies on
+// ConnectInput/ConnectOutput keeping the Operation alive via back-references
+// from the connected tensors -- verify against the Operation lifetime model.
+Operation *TFLiteGraphOptimiser::CreateTransposeForMatMul(const std::shared_ptr &ifm, const Shape &ofmShape)
+{
+    auto op = std::make_shared(OpType::Transpose);
+
+    int32_t permutation[] = {0, 1, 3, 2};
+    auto buf = std::make_shared(4, std::move(permutation), false);
+
+    // IFM should have the untransposed shape
+    op->ConnectInput(TensorUsage::IFM, ifm).Set(Shape(1, ofmShape.Height(), ofmShape.Depth(), ofmShape.Width()));
+    op->ConnectInput(TensorUsage::Params, std::make_shared("perm", DataType::Int32, Shape(4), buf));
+
+    auto ofm = std::make_shared(ifm->Name() + "/" + OpTypeToString(op->Type()), ifm->Type());
+    ofm->SetStorageShape(ofmShape);
+
+    op->ConnectOutput(TensorUsage::OFM, ofm);
+    return op.get();
+}
+
+// Decompose a TFLite BatchMatMul into n = N*H single-batch MatMul ops that
+// read/write height-1 slices of the reshaped operands. Transposes implied by
+// adj_x/adj_y are realised with explicit Transpose ops first.
+Operation *TFLiteGraphOptimiser::RewriteBatchMatMul(Graph *const, Operation *const operation)
+{
+    Operation *returnOp = operation;
+    if ( operation->Type() == OpType::BatchMatMul && _arch->SupportsMatMul(OpType::MatMul) )
+    {
+        const auto ifm = operation->Input(TensorUsage::IFM0);
+        const auto ifm2 = operation->Input(TensorUsage::IFM1);
+        const auto ofm = operation->Output(TensorUsage::OFM);
+
+        bool transposeIfm = false;
+        bool transposeIfm2 = false;
+        const tflite::Operator *const passthrough = static_cast(operation->Passthrough());
+        if ( passthrough )
+        {
+            const auto options = passthrough->builtin_options_as_BatchMatMulOptions();
+            if ( options )
+            {
+                // adj_x = True then ifm should be transposed
+                transposeIfm = options->adj_x();
+                // adj_y = False then ifm2 should be transposed
+                transposeIfm2 = !options->adj_y();
+            }
+        }
+
+        auto ofmShape = Shape::PadAxes(ofm->shape, 4, 1);
+        auto ifmShape = Shape::PadAxes(ifm->shape, 4, 1);
+        auto ifm2Shape = Shape::PadAxes(ifm2->shape, 4, 1);
+
+        int n = ofmShape.Batch() * ofmShape.Height();
+
+        // IFM handling - Reshape ifm N,H,W,C -> 1,NxH,W,C
+        auto ifmReshaped = Shape(1, n, ifmShape.Width(), ifmShape.Depth());
+        auto ifmTensor = ifm->tensor;
+        if ( transposeIfm )
+        {
+            // Add Transpose op, ifm: 1,n,W,C -> 1,n,C,W
+            ifmReshaped = Shape(1, ifmReshaped.Height(), ifmReshaped.Depth(), ifmReshaped.Width());
+            auto op = CreateTransposeForMatMul(ifm->tensor, ifmReshaped);
+            RecordOptimisation(operation, op);
+            ifmTensor = op->Output(TensorUsage::OFM)->tensor;
+        }
+
+        // IFM2 handling - Reshape ifm2 N,H,W,C -> 1,NxH,W,C
+        auto ifm2Reshaped = Shape(1, n, ifm2Shape.Width(), ifm2Shape.Depth());
+        auto ifm2Tensor = ifm2->tensor;
+        if ( transposeIfm2 )
+        {
+            // Add Transpose op, ifm2: 1,n,W,C -> 1,n,C,W
+            ifm2Reshaped = Shape(1, ifm2Reshaped.Height(), ifm2Reshaped.Depth(), ifm2Reshaped.Width());
+            auto op = CreateTransposeForMatMul(ifm2->tensor, ifm2Reshaped);
+            RecordOptimisation(operation, op);
+            ifm2Tensor = op->Output(TensorUsage::OFM)->tensor;
+        }
+
+        // OFM handling
+        // Reshape ofm N,H,W,C -> 1,NxH,W,C
+        auto ofmReshaped = Shape(1, n, ofmShape.Width(), ofmShape.Depth());
+
+        // Add n Matmul ops, one per batch slice
+        for ( int i = 0; i < n; ++i )
+        {
+            auto newOp = std::make_shared(OpType::MatMul);
+            newOp->SetRounding(ifm->tensor->Type() == DataType::Int16 ? RoundMode::NATURAL : RoundMode::DBL);
+
+            TensorSlice ifmSlice = {Shape(0, i, 0, 0), ifmReshaped.WithHeight(1)};
+            newOp->ConnectInput(TensorUsage::IFM0, ifmTensor).Set(ifmReshaped).Set(ifm->quantization).Set(ifmSlice);
+
+            TensorSlice ifm2Slice = {Shape(0, i, 0, 0), ifm2Reshaped.WithHeight(1)};
+            newOp->ConnectInput(TensorUsage::IFM1, ifm2Tensor).Set(ifm2Reshaped).Set(ifm2->quantization).Set(ifm2Slice);
+
+
+            TensorSlice ofmSlice = {Shape(0, i, 0, 0), ofmReshaped.WithHeight(1)};
+            newOp->ConnectOutput(TensorUsage::OFM, ofm->tensor).Set(ofmReshaped).Set(ofm->quantization).Set(ofmSlice);
+
+            RecordOptimisation(operation, newOp.get());
+            returnOp = newOp.get();
+        }
+
+        operation->Disconnect();
+    }
+    return returnOp;
+}
+
+
+// Rewrite a FullyConnected with non-constant weights as a MatMul, since the
+// weight encoder requires constant weights.
+Operation *TFLiteGraphOptimiser::RewriteFullyConnectDynamic(Graph *const, Operation *const operation)
+{
+    Operation *returnOp = operation;
+    auto ifm2 = operation->Input(TensorUsage::Weights);
+    if ( operation->Type() == OpType::FullyConnected && !ifm2->tensor->IsConstant() && _arch->SupportsMatMul(OpType::MatMul) )
+    {
+        const auto ifm = operation->Input(TensorUsage::IFM0);
+        const auto ofm = operation->Output(TensorUsage::OFM);
+
+        auto ofmShape = Shape::PadAxes(ofm->shape, 4, 1);
+        auto ifmShape = Shape::PadAxes(ifm->shape, 4, 1);
+        auto ifm2Shape = Shape::PadAxes(ifm2->shape, 4, 1);
+
+        auto matMulOp = std::make_shared(OpType::MatMul);
+        matMulOp->SetRounding(ifm->tensor->Type() == DataType::Int16 ? RoundMode::NATURAL : RoundMode::DBL);
+
+        matMulOp->ConnectInput(TensorUsage::IFM0, ifm->tensor).Set(ifmShape).Set(ifm->quantization).Set(ifm->slice).Set(ifm->transpose);
+        matMulOp->ConnectInput(TensorUsage::IFM1, ifm2->tensor).Set(ifm2Shape).Set(ifm2->quantization).Set(ifm2->slice).Set(ifm2->transpose);
+        matMulOp->ConnectOutput(TensorUsage::OFM, ofm->tensor).Set(ofmShape).Set(ofm->quantization).Set(ofm->slice).Set(ofm->transpose);
+
+        RecordOptimisation(operation, matMulOp.get());
+        returnOp = matMulOp.get();
+
+        operation->Disconnect();
+    }
+    return returnOp;
+}
+
+
+// Decompose SquaredDifference into cast/mul/sub/mul ops using the same
+// fixed-point arithmetic as the TFLite reference kernel: both inputs are
+// cast to int32, pre-scaled by 2*max(inputScale), subtracted, squared,
+// then post-scaled to the OFM quantization.
+Operation *TFLiteGraphOptimiser::RewriteSquaredDifference(Graph *const, Operation *const operation)
+{
+    Operation *returnOp = operation;
+    if ( operation->Type() == OpType::SquaredDifference )
+    {
+        const auto ifmConn = operation->Input(TensorUsage::IFM0);
+        const auto ifm2Conn = operation->Input(TensorUsage::IFM1);
+        const auto ofmConn = operation->Output(TensorUsage::OFM);
+
+        const double ifmScale = ifmConn->quantization.scales[0].Dequantize();
+        const double ifm2Scale = ifm2Conn->quantization.scales[0].Dequantize();
+        const double ofmScale = ofmConn->quantization.scales[0].Dequantize();
+
+        auto oneScaleQuant = ifmConn->quantization;
+        oneScaleQuant.scales[0] = {1, 0};
+        oneScaleQuant.zeroPoints.clear();
+
+        auto noScaleQuant = ifmConn->quantization;
+        noScaleQuant.scales.clear();
+        noScaleQuant.zeroPoints.clear();
+
+        // All the calculations same as reference kernel
+        const double twiceMaxInputScale = 2.0 * std::max(ifmScale, ifm2Scale);
+        const double realInput1Multiplier = ifmScale / twiceMaxInputScale;
+        const double realInput2Multiplier = ifm2Scale / twiceMaxInputScale;
+
+        // int16 path uses no headroom shift; int8/uint8 get 7 bits of headroom
+        int leftShift = ifmConn->tensor->Type() == DataType::Int16 ? 0 : 7;
+
+        double realOutputMultiplier = (twiceMaxInputScale * twiceMaxInputScale) / ((1 << (leftShift * 2)) * ofmScale);
+
+        auto quantizedRealInput1 = QuantizedScale(realInput1Multiplier);
+        auto quantizedRealInput2 = QuantizedScale(realInput2Multiplier);
+        auto quantizedRealOutput = QuantizedScale(realOutputMultiplier);
+        quantizedRealInput1.scale = std::max(quantizedRealInput1.scale, 1);
+        quantizedRealInput2.scale = std::max(quantizedRealInput2.scale, 1);
+        quantizedRealOutput.scale = std::max(quantizedRealOutput.scale, 1);
+
+        auto input1MultiplierConst = CreateConstTensor(
+            ifmConn->tensor->Name() + "_input1_multiplier", quantizedRealInput1.scale);
+        auto input2MultiplierConst = CreateConstTensor(
+            ifm2Conn->tensor->Name() + "_input2_multiplier", quantizedRealInput2.scale);
+        auto outputMultiplierConst = CreateConstTensor(
+            ofmConn->tensor->Name() + "_output_multiplier", quantizedRealOutput.scale);
+
+        // Convert ifm to 32 bit
+        auto castOp = CreateCastToInt32(ifmConn);
+        // Use explicit scaling (multiplier) for the left shift
+        castOp->Output(TensorUsage::OFM)->quantization.scales.clear();
+        castOp->Output(TensorUsage::OFM)->quantization.scales.push_back(QuantizedScale(1 << leftShift, 0));
+        castOp->Output(TensorUsage::OFM)->quantization.type = QuantizationType::EXPLICIT;
+        // NOTE(review): unlike the ifm2 cast below, this cast op is not passed
+        // to RecordOptimisation -- confirm whether that is intentional.
+
+        // Scale/shift ifm (for 32-bit operations, scale is not applied but shift is)
+        auto mulOp = CreateMul(castOp->Output(TensorUsage::OFM)->tensor, input1MultiplierConst, noScaleQuant, noScaleQuant, noScaleQuant);
+        mulOp->SetRounding(RoundMode::DBL);
+        mulOp->Output(TensorUsage::OFM)->quantization.scales.clear();
+        mulOp->Output(TensorUsage::OFM)->quantization.scales.push_back(QuantizedScale(1, quantizedRealInput1.shift));
+        mulOp->Output(TensorUsage::OFM)->quantization.type = QuantizationType::EXPLICIT;
+        auto ifmScaled = mulOp->Output(TensorUsage::OFM);
+        RecordOptimisation(operation, mulOp);
+
+        // Convert ifm2 to 32 bit
+        castOp = CreateCastToInt32(ifm2Conn);
+        // Use explicit scaling (multiplier) for the left shift
+        castOp->Output(TensorUsage::OFM)->quantization.scales.clear();
+        castOp->Output(TensorUsage::OFM)->quantization.scales.push_back(QuantizedScale(1 << leftShift, 0));
+        castOp->Output(TensorUsage::OFM)->quantization.type = QuantizationType::EXPLICIT;
+        RecordOptimisation(operation, castOp);
+
+        // Scale/shift ifm2 (for 32-bit operations, scale is not applied but shift is)
+        mulOp = CreateMul(castOp->Output(TensorUsage::OFM)->tensor, input2MultiplierConst, noScaleQuant, noScaleQuant, noScaleQuant);
+        mulOp->SetRounding(RoundMode::DBL);
+        mulOp->Output(TensorUsage::OFM)->quantization.scales.clear();
+        mulOp->Output(TensorUsage::OFM)->quantization.scales.push_back(QuantizedScale(1, quantizedRealInput2.shift));
+        mulOp->Output(TensorUsage::OFM)->quantization.type = QuantizationType::EXPLICIT;
+        auto ifm2Scaled = mulOp->Output(TensorUsage::OFM);
+        RecordOptimisation(operation, mulOp);
+
+        // Calculate the raw diff
+        auto subOp = CreateSub(ifmScaled->tensor, ifm2Scaled->tensor, noScaleQuant, noScaleQuant, noScaleQuant);
+        subOp->SetRounding(RoundMode::DBL);
+        auto rawDiff = subOp->Output(TensorUsage::OFM);
+        RecordOptimisation(operation, subOp);
+
+        // Calculate the squared diff
+        mulOp = CreateMul(rawDiff->tensor, rawDiff->tensor, noScaleQuant, noScaleQuant, noScaleQuant);
+        mulOp->SetRounding(RoundMode::DBL);
+        auto squaredRaw = mulOp->Output(TensorUsage::OFM);
+        RecordOptimisation(operation, mulOp);
+
+        // Scale/shift ofm (for 32-bit operations, scale is not applied but shift is)
+        returnOp = CreateMul(squaredRaw->tensor, outputMultiplierConst, noScaleQuant, noScaleQuant, ofmConn->quantization);
+        returnOp->SetRounding(RoundMode::DBL);
+        returnOp->ConnectOutput(TensorUsage::OFM, ofmConn->tensor);
+        returnOp->Output(TensorUsage::OFM)->quantization.scales.clear();
+        returnOp->Output(TensorUsage::OFM)->quantization.scales.push_back(QuantizedScale(1, quantizedRealOutput.shift));
+        returnOp->Output(TensorUsage::OFM)->quantization.type = QuantizationType::EXPLICIT;
+        RecordOptimisation(operation, returnOp);
+
+        operation->Disconnect();
+    }
+    return returnOp;
+}
+
+
+// Fold a SpaceToBatchND -> (Depthwise)Conv2D -> BatchToSpaceND chain into a
+// single dilated convolution: the block shape becomes the kernel dilation and
+// the SpaceToBatch/BatchToSpace ops are disconnected.
+Operation *TFLiteGraphOptimiser::RewriteSpaceToBatchConvBatchToSpace(Graph *const, Operation *const operation)
+{
+    auto opType = operation->Type();
+    if ( opType == OpType::DepthwiseConv2DBias || opType == OpType::Conv2DBias )
+    {
+        auto prevOp = operation->IFM(0)->Writers().empty() ? nullptr : operation->IFM(0)->Writers().front().get();
+        auto nextOp = operation->OFM()->Readers().empty() ? nullptr : operation->OFM()->Readers().front().get();
+        if ( prevOp && prevOp->Type() == OpType::SpaceToBatchND &&  // Previous op is SpaceToBatchND
+             nextOp && nextOp->Type() == OpType::BatchToSpaceND &&  // Next op is BatchToSpaceND
+             operation->IFM(0)->Readers().size() == 1 &&            // No other consumers of SpaceToBatchND output
+             operation->OFM()->Readers().size() == 1                // No other consumers of BatchToSpaceND input
+        )
+        {
+            // Go ahead and short-circuit the SpaceToBatchND and BatchToSpaceND ops
+            operation->ConnectInput(TensorUsage::IFM0, prevOp->Input(TensorUsage::IFM0)->tensor);
+            operation->ConnectOutput(TensorUsage::OFM, nextOp->Output(TensorUsage::OFM)->tensor);
+            // Set new kernel dilation
+            auto blockShape = prevOp->Input(TensorUsage::Params);
+            int count = blockShape->shape[0];
+            assert(count == operation->IFM(0)->StorageShape().Size() - 2);
+            assert(blockShape->tensor->IsConstant());
+            auto values = blockShape->tensor->View().Values();
+            // 1-element block shapes apply the same factor to both axes
+            Point2i dilation(values[0], values[count > 1 ? 1 : 0]);
+            Kernel dilatedKernel = operation->Kernel()->WithDilation(std::move(dilation));
+            // Calculate padding for new kernel
+            Point2i dilatedWH = dilatedKernel.DilatedWH();
+            auto &stride = dilatedKernel.Stride();
+            auto &inputShape = operation->IFM(0)->StorageShape();
+            int xpad = NeededTotalPadding(inputShape.Width(), stride.x, dilatedWH.x);
+            int ypad = NeededTotalPadding(inputShape.Height(), stride.y, dilatedWH.y);
+            Margin pad = Margin(ypad / 2, xpad / 2, (ypad + 1) / 2, (xpad + 1) / 2);
+            // Set the new kernel with updated dilation and padding
+            operation->SetKernel(std::make_unique(dilatedKernel.WithPadding(pad)));
+            // Disconnect the SpaceToBatchND and BatchToSpaceND ops
+            prevOp->Disconnect();
+            nextOp->Disconnect();
+        }
+    }
+    return operation;
+}
+
+// Fixup Conv2DBias and DepthwiseConv2DBias to allow dilation greater than 2.
+// Hardware dilation covers factors of 2; any remaining factor is realised by
+// expanding the constant weights into a larger, sparse (zero-padded) kernel.
+// TODO: Replace with kernel decomposition for supported architectures
+Operation *TFLiteGraphOptimiser::FixupDilationGT2(Graph *const, Operation *const operation)
+{
+    auto returnOp = operation;
+    if ( operation->Type() == OpType::Conv2DBias || operation->Type() == OpType::DepthwiseConv2DBias )
+    {
+        auto dilation = operation->Kernel()->Dilation();
+        // If dilation in either axis is greater than that supported by hardware then we must manually dilate the kernel
+        if ( dilation.x > 2 || dilation.y > 2 )
+        {
+            // If the dilation is a multiple of 2 then the hardware dilation can be enabled to provide that multiple
+            // of 2. This allows the kernel size to be reduced (via the scaled dilation) by half in that dimension.
+            int hwDilationH = (dilation.y % 2 == 0) ? 2 : 1;
+            int hwDilationW = (dilation.x % 2 == 0) ? 2 : 1;
+            // NOTE(review): "maunalDilationW" is a misspelling of manualDilationW
+            // (used consistently below, so behaviour is unaffected).
+            int manualDilationH = dilation.y / hwDilationH;
+            int maunalDilationW = dilation.x / hwDilationW;
+
+            auto *weightConn = operation->Input(TensorUsage::Weights);
+            assert(weightConn);
+            assert(weightConn->tensor->IsConstant());
+            auto weights = weightConn->tensor->View().Values();
+            const auto &weightShape = weightConn->shape;
+
+            // Create new empty kernel with dilated size
+            auto origKernelSize = operation->Kernel()->Size();
+            auto dilatedKernelSize = operation->Kernel()->WithDilation({manualDilationH, maunalDilationW}).DilatedWH();
+            Kernel dilatedKernel = operation->Kernel()->WithDilation({hwDilationH, hwDilationW}).WithSize(dilatedKernelSize);
+            const int newKernelBufferSize = dilatedKernel.ElementsWH() * weightShape.Depth();
+            operation->SetKernel(std::make_unique(std::move(dilatedKernel)));
+
+            // Copy the original kernel values into the new sparse kernel
+            // Width and depth stride same for original and new kernel
+            auto strideC = 1;
+            auto strideW = weightShape.Depth();
+            auto origStrideH = strideW * origKernelSize.x;
+            auto newStrideH = strideW * dilatedKernelSize.x;
+            auto newKernelVals = std::make_unique(newKernelBufferSize);
+            for ( int h = 0; h < origKernelSize.y; ++h )
+            {
+                for ( int w = 0; w < origKernelSize.x; ++w )
+                {
+                    for ( int c = 0; c < weightShape.Depth(); c++ )
+                    {
+                        auto origKernelIdx = c * strideC + w * strideW + h * origStrideH;
+                        auto newKernelIdx = c * strideC + w * strideW * maunalDilationW + h * newStrideH * manualDilationH;
+                        newKernelVals[newKernelIdx] = weights[origKernelIdx];
+                    }
+                }
+            }
+            weightConn->tensor->SetBuffer(std::make_shared(std::move(newKernelVals), newKernelBufferSize));
+            Shape newShape = weightShape.WithHW(dilatedKernelSize.y, dilatedKernelSize.x);
+            weightConn->tensor->SetStorageShape(newShape);
+            weightConn->Set(newShape);
+        }
+    }
+    return returnOp;
+}
+
+// If conv op without bias tensor, create one with zeroes
+Operation *TFLiteGraphOptimiser::FixupBias(Graph *const, Operation *const operation)
+{
+    if
( IsConvolution(operation->Type()) && operation->CountInputs(TensorUsage::Scales) == 0 ) + { + auto ifmConn = operation->Input(TensorUsage::IFM); + auto ofmConn = operation->Output(TensorUsage::OFM); + + // Create bias tensor with zeroes + DataType biasType; + std::shared_ptr biasBuffer; + auto biasElements = ofmConn->shape.Depth(); + if ( ifmConn->tensor->Type() == DataType::Int16 ) + { + biasType = DataType::Int64; + biasBuffer = std::make_shared(std::make_unique(biasElements), biasElements); + } + else + { + biasType = DataType::Int32; + biasBuffer = std::make_shared(std::make_unique(biasElements), biasElements); + } + auto biasTensor = CreateConstTensor("bias", biasType, biasBuffer); + operation->ConnectInput(TensorUsage::Scales, biasTensor); + } + return operation; +} + +// Convert depthwise convolutions with a depth multiplier greater than 1 into a single Conv2D if: +// - the input depth is 1; and +// - the output depth equals the depth multiplier. +Operation *TFLiteGraphOptimiser::RewriteDepthwise(Graph *const, Operation *const operation) +{ + Operation *returnOp = operation; + if ( operation->Type() == OpType::DepthwiseConv2DBias ) + { + const auto ifm = operation->Input(TensorUsage::IFM0); + const auto ofm = operation->Output(TensorUsage::OFM); + const auto multiplier = operation->Kernel()->DepthMultiplier(); + + if ( ifm && (ifm->shape.Depth() == 1) && (multiplier != 1) && ofm && (ofm->shape.Depth() == multiplier) ) + { + auto newOp = std::make_shared(OpType::Conv2DBias); + newOp->SetRounding(ifm->tensor->Type() == DataType::Int16 ? 
RoundMode::NATURAL : RoundMode::DBL); + auto kernel = std::make_unique(operation->Kernel()->Size(), operation->Kernel()->Stride(), + operation->Kernel()->Dilation(), 1, operation->Kernel()->Padding()); + newOp->SetKernel(std::move(kernel)); + ReplaceOperation(operation, newOp.get()); + returnOp = newOp.get(); + RecordOptimisation(operation, returnOp); + } + } + return returnOp; +} + + +// Check that no reshape like operations remain in graph. +Operation *TFLiteGraphOptimiser::CheckReshapeOpsRemoved(Graph *const graph, Operation *const operation) +{ + UNUSED(graph); + OpType opType = operation->Type(); + if ( IsReshape(opType) ) + { + LOG_ERROR("Reshape-like operation type {0} expected to have been removed, still remains.\n", OpTypeToString(opType)); + assert(false); + } + return operation; +} + +Operation *TFLiteGraphOptimiser::ConvertSoftmaxOps(Graph *const graph, Operation *const operation) +{ + UNUSED(graph); + return _softmax->ConvertOp(operation); +} + +Operation *TFLiteGraphOptimiser::RewriteFullyConnectedInput(Graph *const graph, Operation *const operation) +{ + UNUSED(graph); + if ( operation->Type() == OpType::FullyConnected ) + { + auto weights = operation->Input(TensorUsage::Weights); + assert(weights != nullptr); + auto nInElems = weights->shape.Depth(); + auto ifm = operation->Input(TensorUsage::IFM0); + auto &ifmShape = ifm->slice.shape.IsEmpty() ? 
ifm->shape : ifm->slice.shape; + auto elms = ifmShape.Elements(); + auto batchSize = elms / nInElems; + assert(batchSize * nInElems == elms); + ifmShape = Shape(batchSize, 1, 1, nInElems); + } + return operation; +} + +// Must be called after RewriteFullyConnectedInput +Operation *TFLiteGraphOptimiser::ConvertBatchedFullyConnected(Graph *const graph, Operation *const operation) +{ + UNUSED(graph); + auto returnOp = operation; + if ( operation->Type() == OpType::FullyConnected ) + { + auto ifm = operation->Input(TensorUsage::IFM0); + // Check if the first dimension indicates batching + auto &ifmShape = ifm->slice.shape.IsEmpty() ? ifm->shape : ifm->slice.shape; + int n = ifmShape.Batch(); + if ( n > 1 ) + { + assert(ifmShape.Height() == 1 && ifmShape.Width() == 1); + int h = 1; + int w = n; + // More square H/W gives better performance up to a point + for ( int x = 2; x <= 16 && x * x <= n; ++x ) + { + if ( n % x == 0 ) + { + h = x; + w = n / x; + } + } + ifmShape = Shape(1, h, w, ifmShape.Depth()); + auto ofm = operation->Output(TensorUsage::OFM); + ofm->shape = Shape(1, h, w, ofm->shape.Depth()); + if ( h > 4 || w > 4 ) + { + // Ended up with shape that requires the weights to be reread. + // Convert op to conv2d since this enables weight buffering. + auto newOp = std::make_shared(OpType::Conv2DBias); + newOp->SetRounding(ifm->tensor->Type() == DataType::Int16 ? 
RoundMode::NATURAL : RoundMode::DBL); + ReplaceOperation(operation, newOp.get()); + returnOp = newOp.get(); + RecordOptimisation(operation, returnOp); + } + } + } + return returnOp; +} + +Operation *TFLiteGraphOptimiser::UnrollConv(Graph *const, Operation *const operation) +{ + auto returnOp = operation; + + if ( operation->Type() == OpType::Conv2D || operation->Type() == OpType::Conv2DBias ) + { + const auto ifmConn = operation->Input(TensorUsage::IFM); + assert(ifmConn); + const auto weightsConn = operation->Input(TensorUsage::Weights); + assert(weightsConn); + const auto scalesConn = operation->Input(TensorUsage::Scales); + assert(scalesConn); + const auto ofmConn = operation->Output(TensorUsage::OFM); + assert(ofmConn); + + const auto kernel = operation->Kernel(); + assert(kernel); + const int32_t kernel_h = kernel->Size().y; + assert(kernel_h > 0); + const int32_t kernel_w = kernel->Size().x; + assert(kernel_w > 0); + const int32_t stride_h = kernel->Stride().y; + assert(stride_h > 0); + const int32_t stride_w = kernel->Stride().x; + assert(stride_w > 0); + const int32_t dilation_h = kernel->Dilation().y; + assert(dilation_h > 0); + const int32_t dilation_w = kernel->Dilation().x; + assert(dilation_w > 0); + const bool hasPadding = !kernel->Padding().IsZero(); + const bool hasIfmSlice = ifmConn->slice.shape.IsValid() || ifmConn->slice.offset.IsValid(); + const bool hasOfmSlice = ofmConn->slice.shape.IsValid() || ofmConn->slice.offset.IsValid(); + + tflite::Padding paddingType = tflite::Padding::VALID; + const tflite::Operator *const passthrough = static_cast(operation->Passthrough()); + if ( passthrough ) + { + const auto options = passthrough->builtin_options_as_Conv2DOptions(); + if ( options ) + { + paddingType = options->padding(); + } + } + + // Figure out if op needs to be unrolled + const bool needUnrollH = stride_h > 3; + const bool needUnrollW = stride_w > 3; + + // Figure out if op can be unrolled + const bool canUnroll = !hasPadding && !hasIfmSlice 
&& !hasOfmSlice && paddingType == tflite::Padding::VALID; + const bool canUnrollH = dilation_h == 1 && canUnroll; + const bool canUnrollW = dilation_w == 1 && canUnroll; + + if ( (needUnrollH || needUnrollW) && canUnrollH && canUnrollW ) + { + const Shape inputGridCell = ifmConn->shape.WithHeight(kernel_h).WithWidth(kernel_w); + const Shape outputGridCell = ofmConn->shape.WithHeight(1).WithWidth(1); + const Point2i gridSize = ofmConn->shape.WH(); + + for ( int h = 0; h < gridSize.y; h++ ) + { + for ( int w = 0; w < gridSize.x; w++ ) + { + TensorSlice ifmSlice; + ifmSlice.shape = inputGridCell; + ifmSlice.offset = Shape(0, h * stride_h, w * stride_w, 0); + + TensorSlice ofmSlice; + ofmSlice.shape = outputGridCell; + ofmSlice.offset = Shape(0, h, w, 0); + + // Add new for this grid cell + auto op = std::make_shared(operation->Type()); + op->SetKernel(std::make_unique(kernel->WithStride({1, 1}))); + op->CopyInput(TensorUsage::IFM, *ifmConn); + op->Input(TensorUsage::IFM)->Set(ifmSlice); + op->CopyInput(TensorUsage::Weights, *weightsConn); + op->CopyInput(TensorUsage::Scales, *scalesConn); + op->CopyOutput(TensorUsage::OFM, *ofmConn); + op->Output(TensorUsage::OFM)->Set(ofmSlice); + RecordOptimisation(operation, op.get()); + + returnOp = op.get(); + } + } + + // Remove original op + operation->Disconnect(); + } + } + + return returnOp; +} + +static bool MeanOpSupported(Operation *const operation, Shape &reduceAxis, Shape &ifmShape4D) +{ + auto ifmConn = operation->Input(TensorUsage::IFM0); + auto ifm = ifmConn->tensor; + auto axis = operation->Input(TensorUsage::Params)->tensor; + auto axisValues = axis->View().Values(); + auto axisCount = axis->StorageShape().IsEmpty() ? 
1 : axis->StorageShape().Depth(); + auto ifmDims = ifmShape4D.Size(); + + // Max kernel size + static constexpr int MAX_MEAN_KERNEL_SIZE = 64 * 64; + // Max size to avoid overflow INT32 + static constexpr int MAX_MEAN_ELEMENTS_INT8 = 2 << 23; // 2²⁴ x 2⁷ = 2³¹ + static constexpr int MAX_MEAN_ELEMENTS_UINT8 = 2 << 22; // 2²³ x 2⁸ = 2³¹ + static constexpr int MAX_MEAN_ELEMENTS_INT16 = 2 << 15; // 2¹⁶ x 2¹⁵ = 2³¹ + + bool supported = false; + + // Compute total number of elements + int elements = 1; + for ( int i = 0; i < ifmDims; ++i ) + { + elements *= reduceAxis[i] ? ifmShape4D[i] : 1; + } + + // Make sure overflow can not occur + switch ( ifm->Type() ) + { + case DataType::Int8: + supported = elements <= MAX_MEAN_ELEMENTS_INT8; + break; + + case DataType::UInt8: + supported = elements <= MAX_MEAN_ELEMENTS_UINT8; + break; + + case DataType::Int16: + supported = elements <= MAX_MEAN_ELEMENTS_INT16; + break; + + default: + supported = false; + break; + } + + // Only support batch 1 + supported = supported && (ifmShape4D.Batch() == 1); + + // Reduced axis must be no greater than MAX_MEAN_KERNEL_SIZE + supported = supported && (reduceAxis.Depth() * ifmShape4D.Depth() <= MAX_MEAN_KERNEL_SIZE); + supported = supported && (reduceAxis.Width() * ifmShape4D.Width() <= MAX_MEAN_KERNEL_SIZE); + supported = supported && (reduceAxis.Height() * ifmShape4D.Height() <= MAX_MEAN_KERNEL_SIZE); + + // Depth is supported if any of h,w,c == 1 + if ( supported && reduceAxis.Depth() ) + { + supported = false; + for ( int i = 1; i < 4; i++ ) + { + if ( ifmShape4D[i] == 1 ) + { + supported = true; + break; + } + } + } + return supported; +} + +Operation *TFLiteGraphOptimiser::ConvertMeanOps(Graph *const, Operation *const operation) +{ + auto returnOp = operation; + if ( operation->Type() == OpType::Mean ) + { + auto ifmConn = operation->Input(TensorUsage::IFM0); + auto ofmConn = operation->Output(TensorUsage::OFM); + auto axis = operation->Input(TensorUsage::Params)->tensor; + auto 
axisValues = axis->View().Values(); + auto axisCount = axis->StorageShape().IsEmpty() ? 1 : axis->StorageShape().Depth(); + auto &ifmShape = ifmConn->shape; + auto &ofmShape = ofmConn->shape; + auto ifmDims = ifmShape.Size(); + auto ofmDims = ofmShape.Size(); + auto &ifmQuant = ifmConn->quantization; + auto &ofmQuant = ofmConn->quantization; + static constexpr int MAX_MEAN_HEIGHT = 64; + static constexpr int MAX_MEAN_KERNEL_SIZE = 64 * 64; + + // Create a 4D shape to indicate which axis that will be reduced + Shape reduceAxis = ifmShape.WithZeros(); + for ( int i = 0; i < axisCount; ++i ) + { + reduceAxis[axisValues[i]] = 1; + } + reduceAxis = Shape::PadAxes(reduceAxis, 4, 0); + + Shape ifmShape4D = Shape::PadAxes(ifmShape, 4, 1); + + // Check if it is possible to convert the MEAN + if ( !MeanOpSupported(operation, reduceAxis, ifmShape4D) ) + { + return operation; + } + + // Fix intermediateShape when keep_dims is false + // e.g. IFM=1xHxWxC axis=2 OFM=1xHxC, the intermediateShape should be 1xHx1xC + Shape intermediateShape = ofmConn->shape; + if ( ofmDims < ifmDims ) + { + for ( int i = 0; i < ifmDims; i++ ) + { + if ( reduceAxis[i] ) + { + intermediateShape = intermediateShape.Insert(i, 1); + } + } + } + intermediateShape = Shape::PadAxes(intermediateShape, 4, 1); + + // Support mean over depth-axis by left-shifting the C channel + // From operator checks we can assume that one of H,W,C has shape 1 + if ( reduceAxis.Depth() && ifmShape4D.Depth() > 1 ) + { + // If W=1 reshape NxHx1xC -> NxHxCx1, else reshape Nx1xWxC -> NxWxCx1 + int idxToDelete = ifmShape.Width() == 1 ? 
2 : 1; + + // Delete axis with size 1 + reduceAxis = reduceAxis.Erase(idxToDelete); + ifmShape4D = ifmShape4D.Erase(idxToDelete); + intermediateShape = intermediateShape.Erase(idxToDelete); + + // Add another element to set channel-axis to one + reduceAxis = reduceAxis.Insert(3, 0); + ifmShape4D = ifmShape4D.Insert(3, 1); + intermediateShape = intermediateShape.Insert(3, 1); + } + + // Compute kernel sizes for our convolutions + int h = reduceAxis.Height() ? ifmShape4D.Height() : 1; + int w = reduceAxis.Width() ? ifmShape4D.Width() : 1; + + assert(CheckSafeMul(w, h)); + int num_elements_in_axis = h * w; + + // If one convolution is enough, but height is greater than max kernel height + // reshape from HxW to 1x(HxW) + // This can only be done if the mean is computed over both H and W + if ( h > MAX_MEAN_HEIGHT && num_elements_in_axis <= MAX_MEAN_KERNEL_SIZE && reduceAxis.Height() && reduceAxis.Width() ) + { + ifmShape4D = Shape(ifmShape4D.Batch(), 1, h * w, ifmShape4D.Depth()); + w = h * w; + h = 1; + } + + // When h x w <= 4096 When h x w > 4096 there is a need to split into several ops. + // Do this by splitting up h and change the read_offset/shape. 
+ // Below is an example where ifm is 1x190x64x1 + // MEAN MEAN + // | |-----------------------|----------------------| + // DepthwiseConv2DBias 1_DepthwiseConv2DBias 2_DepthwiseConv2DBias 3_DepthwiseConv2DBias + // | | | | + // MUL |---------ADD-----------| | + // | | + // |----------------ADD---------------| + // | + // MUL + // 1_DepthwiseConv2DBias: read_offset [0, 0, 0, 0]> read_shape [1, 64, 64, 1]> + // 2_DepthwiseConv2DBias: read_offset [0, 64, 0, 0]> read_shape [1, 64, 64, 1]> + // 3_DepthwiseConv2DBias: read_offset [0, 128, 0, 0]> read_shape [1, 62, 64, 1]> + + + int heightPerConv = std::min(MAX_MEAN_KERNEL_SIZE / w, h); + heightPerConv = std::min(heightPerConv, MAX_MEAN_HEIGHT); + int opCount = (h + heightPerConv - 1) / heightPerConv; + Quantization oneScaleQuant = ifmConn->quantization; + oneScaleQuant.scales.clear(); + oneScaleQuant.scales.push_back({1, 0}); + Quantization oneScaleQuantZp0 = oneScaleQuant; + oneScaleQuantZp0.zeroPoints.clear(); + oneScaleQuantZp0.zeroPoints.push_back(0); + + std::shared_ptr accTensor = nullptr; + + // Reuse weight tensor if more ops are needed + std::shared_ptr weightTensor = nullptr; + std::shared_ptr biasTensor = nullptr; + + // set weight quantization + Quantization weightQuant = ifmConn->quantization; + weightQuant.quantMin = {0}; + weightQuant.quantMax = {255}; + weightQuant.scales.clear(); + weightQuant.zeroPoints.clear(); + weightQuant.scales.push_back({1, 0}); + weightQuant.zeroPoints.push_back(0); + + for ( int i = 0; i < opCount; ++i ) + { + bool isLastOp = (i == (opCount - 1)); + + // Compute height for the kernel + int kh = heightPerConv; + if ( isLastOp && h % heightPerConv != 0 ) + { + kh = h % heightPerConv; + // New kernel shape so new weight tensor is needed + weightTensor = nullptr; + biasTensor = nullptr; + } + + // Calculate read and offset shape + int readShapeH = reduceAxis.Height() ? kh : ifmShape4D.Height(); + int readShapeW = reduceAxis.Width() ? 
w : ifmShape4D.Width(); + + Shape readOffset(0, i * heightPerConv, 0, 0); + Shape readShape = ifmShape4D.WithHW(readShapeH, readShapeW); + + auto op = MakeDepthwiseMeanOp(ifmConn, ifmShape4D, readShape, readOffset, intermediateShape, w, kh, + ofmConn->tensor->Name(), weightTensor, biasTensor, oneScaleQuant, weightQuant, oneScaleQuantZp0); + RecordOptimisation(operation, op); + + if ( i > 0 ) + { + // Add result to accumulator tensor + Quantization accQuant = op->Output(TensorUsage::OFM)->quantization; + op = CreateAdd(accTensor, op->Output(TensorUsage::OFM)->tensor, oneScaleQuantZp0, oneScaleQuantZp0, oneScaleQuantZp0); + op->SetRounding(RoundMode::DBL); + op->Output(TensorUsage::OFM)->quantization.scales.clear(); + op->Output(TensorUsage::OFM)->quantization.scales.push_back(QuantizedScale(1, 0)); + op->Output(TensorUsage::OFM)->quantization.type = QuantizationType::EXPLICIT; + RecordOptimisation(operation, op); + } + accTensor = op->Output(TensorUsage::OFM)->tensor; + } + QuantizedScale quant(ifmQuant.scales[0].Dequantize() / ofmQuant.scales[0].Dequantize()); + + // Convert to left shift-positive notation + auto outputShift = 31 - quant.shift; + + // Below calculation same as in reference to avoid any risk of overflow, + // clamping the shift value at the price of some precision loss. 
+ // IntLog2 same as 63 - CountLeadingZeros(num_elements_in_axis) + int shift = IntLog2(num_elements_in_axis); + shift = std::min(shift, 32); + shift = std::min(shift, 31 + outputShift); + // Multiplier should be 32bit + int32_t outputMultiplier = int32_t((int64_t(quant.scale) << shift) / num_elements_in_axis); + + // Convert to right-shift + outputShift = 31 - (outputShift - shift); + + // For int32 scaling is not supported so instead multiply with the scale + auto scalar = CreateConstTensor(ofmConn->tensor->Name() + "_scalar", outputMultiplier); + auto op = CreateMul(accTensor, scalar, oneScaleQuantZp0, oneScaleQuantZp0, oneScaleQuantZp0); + op->SetRounding(RoundMode::DBL); + + // Apply the shift + QuantizedScale scale(1, outputShift); + Quantization outQuant = ofmConn->quantization; + outQuant.scales.clear(); + outQuant.scales.push_back({1, outputShift}); + outQuant.type = QuantizationType::EXPLICIT; + op->ConnectOutput(TensorUsage::OFM, ofmConn->tensor).Set(intermediateShape).Set(outQuant); + RecordOptimisation(operation, op); + operation->Disconnect(); + returnOp = op; + } + + return returnOp; +} + +// Converts int8/uint8 Sigmoid and Tanh to a LUT based solution +Operation *TFLiteGraphOptimiser::ConvertTanhSigmoidToLUT(Graph *const, Operation *const operation) +{ + auto returnOp = operation; + auto opType = operation->Type(); + auto ifmConn = operation->Input(TensorUsage::IFM0); + auto ifm = ifmConn->tensor.get(); + + if ( ifm->Type() == DataType::Int16 && (opType == OpType::Sigmoid || opType == OpType::Tanh) ) + { + if ( _arch->SupportsSigmoidTanhLutInt16(opType) ) + { + returnOp = ConvertTanhSigmoidToLUT16(operation); + } + } + else if ( opType == OpType::Sigmoid ) + { + returnOp = ConvertToLUT8( + operation, [](double x) -> double { return ClampSigmoid8(x); }, "sigmoid"); + } + else if ( opType == OpType::Tanh ) + { + returnOp = ConvertToLUT8( + operation, [](double x) -> double { return std::tanh(x); }, "tanh"); + } + + + if ( operation != returnOp ) + { + 
RecordOptimisation(operation, returnOp); + operation->Disconnect(); + } + + return returnOp; +} + + +Operation *TFLiteGraphOptimiser::ConvertPrelu(Graph *const graph, Operation *const operation) +{ + // Lowering of PReLU + // To Minimum + Mul + ReLU + Add + // + // x>0 x <= 0 + // ReLU Minimum(x, 0) + // \ / + // \ Mul(alpha) + // \ / + // Add + // + // ReLU is used for positive input values + // Minimum(x,0) + Mul(alpha) is used for negative input values + // Add sums the two cases + UNUSED(graph); + auto returnOp = operation; + auto opType = operation->Type(); + const auto ifmConn = operation->Input(TensorUsage::IFM0); + const auto params = operation->Input(TensorUsage::Params); + const auto ofmConn = operation->Output(TensorUsage::OFM); + + if ( opType == OpType::Prelu && ifmConn && ofmConn && params ) + { + Quantization ofmQuant = ofmConn->quantization; + Quantization ifmQuant = ifmConn->quantization; + Quantization alphaQuant = params->quantization; + + Quantization noScaleQuant = Quantization::Unit(); + noScaleQuant.scales.clear(); + noScaleQuant.zeroPoints.clear(); + + Quantization unitQuantOfmZp = Quantization::Unit(); + unitQuantOfmZp.zeroPoints.clear(); + unitQuantOfmZp.zeroPoints.push_back(ofmQuant.zeroPoints[0]); + unitQuantOfmZp.type = QuantizationType::EXPLICIT; + + std::shared_ptr zeroTens; + if ( ifmConn->tensor->Type() == DataType::Int16 ) + { + zeroTens = CreateConstTensor("zero_const", int16_t(0)); + } + else + { + zeroTens = CreateConstTensor("zero_const", int8_t(0)); + } + std::shared_ptr fmNegative = ifmConn->tensor->Clone(); + std::shared_ptr fmAlpha = ofmConn->tensor->Clone(); + std::shared_ptr fmScaled = ofmConn->tensor->Clone(); + + // Select values < 0 + auto minOp = std::make_shared(OpType::Minimum); + minOp->CopyInput(TensorUsage::IFM0, *ifmConn); + minOp->ConnectInput(TensorUsage::IFM1, zeroTens).Set(noScaleQuant); + minOp->ConnectOutput(TensorUsage::OFM, fmNegative).Set(ifmConn->quantization); + minOp->SetRounding(RoundMode::DBL); + 
RecordOptimisation(operation, minOp.get()); + + // and multiply with alpha tensor + auto mulAlpha = std::make_shared(OpType::Mul); + mulAlpha->CopyInput(TensorUsage::IFM0, *minOp->Output(TensorUsage::OFM)); + mulAlpha->CopyInput(TensorUsage::IFM1, *params); + mulAlpha->ConnectOutput(TensorUsage::OFM, fmAlpha).Set(ofmConn->quantization); + mulAlpha->SetRounding(RoundMode::DBL); + RecordOptimisation(operation, mulAlpha.get()); + + // Select (and scale) values > 0 + auto reluOp = std::make_shared(OpType::Relu); + reluOp->CopyInput(TensorUsage::IFM0, *ifmConn); + reluOp->ConnectOutput(TensorUsage::OFM, fmScaled).Set(ofmConn->quantization); + reluOp->Output(TensorUsage::OFM)->quantization.quantMin.push_back(ofmConn->quantization.zeroPoints[0]); + reluOp->SetRounding(RoundMode::DBL); + RecordOptimisation(operation, reluOp.get()); + + // Add scaled and alpha multiplied values + auto addOp = std::make_shared(OpType::Add); + addOp->ConnectInput(TensorUsage::IFM0, fmAlpha).Set(unitQuantOfmZp); + addOp->ConnectInput(TensorUsage::IFM1, fmScaled).Set(unitQuantOfmZp); + addOp->CopyOutput(TensorUsage::OFM, *ofmConn); + addOp->Output(TensorUsage::OFM)->Set(unitQuantOfmZp); + addOp->SetRounding(RoundMode::DBL); + RecordOptimisation(operation, addOp.get()); + returnOp = addOp.get(); + operation->Disconnect(); + } + return returnOp; +} + +// Converts LeakyReLU +// +// alpha == 0 +// converted to ReLU +// alpha == -1 +// converted to Abs +// 8-bit LeakyReLU +// converted to a LUT if unsupported by arch +// 16-bit LeakyReLU: +// alpha > 1 +// Converted to Mul + (Mul) + Min if unsupported by arch +// The extra Mul is needed if ifmQuant != ofmQuant +// alpha <= 1 +// Converted to Mul + (Mul) + Max if unsupported by arch +Operation *TFLiteGraphOptimiser::ConvertLeakyRelu(Graph *const graph, Operation *const operation) +{ + UNUSED(graph); + auto returnOp = operation; + auto opType = operation->Type(); + auto ifmConn = operation->Input(TensorUsage::IFM0); + auto ofmConn = 
operation->Output(TensorUsage::OFM); + + // TODO MLBEDSW-8770: Investigate performance of leakyReLU optimisations + if ( opType == OpType::LeakyRelu && ifmConn != nullptr && ofmConn != nullptr ) + { + float alpha = operation->Parameters().leaky_relu.alpha; + auto ifm = ifmConn->tensor.get(); + auto ofm = ofmConn->tensor.get(); + bool quantized = !IsScalingValidAndEqual(*ifmConn, *ofmConn); + if ( alpha == 0 || std::isinf(1 / alpha) ) + { + // alpha == 0 can be converted to ReLU + auto reluOp = MakeOperation(OpType::Relu, ifmConn, nullptr, ofmConn); + reluOp->Output(TensorUsage::OFM)->quantization.quantMin.push_back(ofmConn->quantization.zeroPoints[0]); + RecordOptimisation(operation, reluOp); + returnOp = reluOp; + } + else if ( alpha == -1 ) + { + // alpha == -1 can be converted to Abs + auto absOp = MakeOperation(OpType::Abs, ifmConn, nullptr, ofmConn); + RecordOptimisation(operation, absOp); + returnOp = absOp; + } + else if ( alpha < 0 || !_arch->SupportsLeakyRelu(quantized, ifm->Type()) ) + { + if ( (ifm->Type() == DataType::Int8 || ifm->Type() == DataType::UInt8) ) + { + // convert to 8-bit LUT + assert(ifm->Type() == ofm->Type()); + returnOp = ConvertToLUT8( + operation, [&alpha](double x) -> double { return x < 0 ? (alpha * x) : x; }, "LeakyReLU"); + RecordOptimisation(operation, returnOp); + } + else + { + // Use 16-bit lowering to Mul + Max or Mul + Min + returnOp = ConvertLeakyRelu16bit(*ifmConn, *ofmConn, operation); + } + } + } + + if ( operation != returnOp ) + { + operation->Disconnect(); + } + + return returnOp; +} + +// Converts RSqrt to a LUT based solution. +Operation *TFLiteGraphOptimiser::ConvertRSqrtToLUT(Graph *const graph, Operation *const operation) +{ + UNUSED(graph); + auto returnOp = operation; + auto opType = operation->Type(); + auto ifmConn = operation->Input(TensorUsage::IFM0); + auto ofmConn = operation->Output(TensorUsage::OFM); + + // LUT has been generated by printing the output from the reference. 
+ // clang-format off + static const int32_t kRSqrtLut[] = + { + 0x00000000, 0x00100000, 0x000b504e, 0x00093cd4, 0x00080000, 0x000727c9, 0x0006882f, 0x00060c24, + 0x0005a827, 0x00055555, 0x00050f45, 0x0004d2fe, 0x00049e6a, 0x00047007, 0x000446b4, 0x00042195, + 0x00040000, 0x0003e16d, 0x0003c570, 0x0003abb0, 0x000393e5, 0x00037dd2, 0x00036945, 0x00035613, + 0x00034418, 0x00033333, 0x0003234b, 0x00031447, 0x00030612, 0x0002f89c, 0x0002ebd3, 0x0002dfaa, + 0x0002d414, 0x0002c906, 0x0002be75, 0x0002b45a, 0x0002aaab, 0x0002a161, 0x00029875, 0x00028fe3, + 0x000287a2, 0x00027fb0, 0x00027807, 0x000270a2, 0x0002697f, 0x00026298, 0x00025bec, 0x00025577, + 0x00024f35, 0x00024925, 0x00024343, 0x00023d8e, 0x00023803, 0x000232a1, 0x00022d65, 0x0002284e, + 0x0002235a, 0x00021e87, 0x000219d5, 0x00021541, 0x000210cb, 0x00020c70, 0x00020831, 0x0002040c, + 0x00020000, 0x0001fc0c, 0x0001f82f, 0x0001f468, 0x0001f0b7, 0x0001ed1a, 0x0001e991, 0x0001e61b, + 0x0001e2b8, 0x0001df67, 0x0001dc26, 0x0001d8f7, 0x0001d5d8, 0x0001d2c8, 0x0001cfc8, 0x0001ccd6, + 0x0001c9f2, 0x0001c71c, 0x0001c454, 0x0001c198, 0x0001bee9, 0x0001bc46, 0x0001b9af, 0x0001b723, + 0x0001b4a3, 0x0001b22d, 0x0001afc2, 0x0001ad61, 0x0001ab0a, 0x0001a8bc, 0x0001a678, 0x0001a43e, + 0x0001a20c, 0x00019fe3, 0x00019dc2, 0x00019baa, 0x0001999a, 0x00019791, 0x00019590, 0x00019397, + 0x000191a5, 0x00018fbb, 0x00018dd7, 0x00018bfa, 0x00018a23, 0x00018853, 0x0001868a, 0x000184c6, + 0x00018309, 0x00018152, 0x00017fa0, 0x00017df4, 0x00017c4e, 0x00017aad, 0x00017911, 0x0001777b, + 0x000175e9, 0x0001745d, 0x000172d6, 0x00017153, 0x00016fd5, 0x00016e5b, 0x00016ce7, 0x00016b76, + 0x00016a0a, 0x000168a2, 0x0001673e, 0x000165de, 0x00016483, 0x0001632b, 0x000161d7, 0x00016087, + 0x00015f3b, 0x00015df2, 0x00015cad, 0x00015b6b, 0x00015a2d, 0x000158f2, 0x000157bb, 0x00015686, + 0x00015555, 0x00015427, 0x000152fd, 0x000151d5, 0x000150b0, 0x00014f8f, 0x00014e70, 0x00014d54, + 0x00014c3b, 0x00014b24, 0x00014a11, 0x00014900, 0x000147f1, 0x000146e5, 
0x000145dc, 0x000144d5, + 0x000143d1, 0x000142cf, 0x000141d0, 0x000140d3, 0x00013fd8, 0x00013ee0, 0x00013de9, 0x00013cf5, + 0x00013c03, 0x00013b14, 0x00013a26, 0x0001393b, 0x00013851, 0x0001376a, 0x00013684, 0x000135a1, + 0x000134bf, 0x000133e0, 0x00013302, 0x00013226, 0x0001314c, 0x00013074, 0x00012f9e, 0x00012ec9, + 0x00012df6, 0x00012d25, 0x00012c55, 0x00012b87, 0x00012abb, 0x000129f1, 0x00012928, 0x00012860, + 0x0001279a, 0x000126d6, 0x00012613, 0x00012552, 0x00012492, 0x000123d4, 0x00012317, 0x0001225c, + 0x000121a2, 0x000120e9, 0x00012032, 0x00011f7c, 0x00011ec7, 0x00011e14, 0x00011d62, 0x00011cb1, + 0x00011c02, 0x00011b54, 0x00011aa7, 0x000119fb, 0x00011950, 0x000118a7, 0x000117ff, 0x00011758, + 0x000116b3, 0x0001160e, 0x0001156b, 0x000114c8, 0x00011427, 0x00011387, 0x000112e8, 0x0001124a, + 0x000111ad, 0x00011111, 0x00011076, 0x00010fdc, 0x00010f44, 0x00010eac, 0x00010e15, 0x00010d7f, + 0x00010cea, 0x00010c56, 0x00010bc4, 0x00010b32, 0x00010aa0, 0x00010a10, 0x00010981, 0x000108f3, + 0x00010865, 0x000107d9, 0x0001074d, 0x000106c2, 0x00010638, 0x000105af, 0x00010527, 0x0001049f, + 0x00010419, 0x00010393, 0x0001030e, 0x0001028a, 0x00010206, 0x00010183, 0x00010102, 0x00010080 + }; + // clang-format on + + if ( opType == OpType::Rsqrt && ifmConn->tensor->Type() == DataType::Int8 && ofmConn->tensor->Type() == DataType::Int8 ) + { + const int kShift = 20; + const int qMin = -128; + const int qMax = 127; + const auto zpIn = ifmConn->quantization.zeroPoints[0]; + const auto zpOut = ofmConn->quantization.zeroPoints[0]; + const auto ifmScale = ifmConn->quantization.scales[0].Dequantize(); + const auto ofmScale = ofmConn->quantization.scales[0].Dequantize(); + double scale = 1.0 / double(std::sqrt(float(ifmScale)) * float(ofmScale)); + QuantizedScale qScale = QuantizedScale(scale); + // convert to left shift-positive notation + qScale.shift = 31 - qScale.shift - kShift; + + std::vector lut; + lut.reserve(256); + lut.push_back(qMax); + for ( int x = qMin + 1; x <= qMax; 
++x ) + { + int index = std::max(0, x - int(zpIn)); + auto value = zpOut + MultiplyByQuantizedMultiplier(kRSqrtLut[index], qScale); + lut.push_back(uint8_t(std::min(qMax, std::max(qMin, int(value))))); + } + + auto lutTens = CreateConstTensor("rsqrt", ifmConn->tensor->Type(), std::make_shared(std::move(lut))); + returnOp = CreateLUT(ifmConn->tensor, lutTens, ifmConn->quantization, ifmConn->quantization, lutTens->Type(), + &ifmConn->shape, ofmConn->tensor, ifmConn->slice, ofmConn->slice); + returnOp->SetRounding(RoundMode::NATURAL); + } + else if ( opType == OpType::Rsqrt && ifmConn->tensor->Type() == DataType::Int16 && ofmConn->tensor->Type() == DataType::Int16 ) + { + const auto ofmScale = operation->Output(TensorUsage::OFM)->quantization.scales[0].Dequantize(); + returnOp = ConvertToInterpolatingLUT16( + operation, + [&ofmScale](double x) -> double + { + if ( x <= 0.0f ) + { + return IntegerMax(DataType::Int16) * ofmScale; + } + else + { + return 1 / std::sqrt(x); + } + }, + "Rsqrt16(interp)"); + } + + if ( operation != returnOp ) + { + RecordOptimisation(operation, returnOp); + operation->Disconnect(); + } + + return returnOp; +} + +// Based on explicit padding provided in a PAD operation, returns adjusted value for +// padAfter that provides equivalent results when used with explicit padding +int TFLiteGraphOptimiser::CalcPadAfter(int inputSize, int stride, int filterSize, int padBefore, int padAfter) +{ + int totalPadding = NeededTotalPadding(inputSize, stride, filterSize); + // The bottom/right padding might need downward adjustment depending on stride/input size + int remainderDiff = padAfter % stride - (totalPadding - padBefore) % stride; + return std::max(0, padAfter - remainderDiff - (remainderDiff >= 0 ? 0 : stride)); +} + +// Tries to completely remove a PAD operator by using explicit padding. +// E.g. 
a PAD operation that pads 1, followed by a CONV with VALID padding and kernel size 3 +// is rewritten such that the PAD is removed, and the CONV uses explicit padding. +// Converts tens1 -> PAD -> tens2 -> CONV to tens1 -> CONV +// This is the most efficient way to implement PAD, but cannot be done for all pad sizes. +Operation *TFLiteGraphOptimiser::ReplacePadByExplicitPadding(Graph *const graph, Operation *const operation) +{ + UNUSED(graph); + if ( IsConvolution(operation->Type()) && operation->Kernel()->Padding().IsZero() && operation->IFM(0)->Writers().size() == 1 ) + { + // Potential for future optimization: in certain cases also Pad+AvgPool can be handled + // by changing to Depthwise. + auto padOp = operation->IFM(0)->Writers()[0].get(); + if ( padOp->Type() != OpType::Pad ) + { + return operation; + } + auto padIfmConn = padOp->Input(TensorUsage::IFM0); + auto padOfmConn = padOp->Output(TensorUsage::OFM); + const auto &padIfm = padOp->IFM(0); + const auto &padOfm = padOp->OFM(); + + if ( padIfm->Type() != padOfm->Type() || !IsScalingValidAndEqual(*padIfmConn, *padOfmConn) ) + { + return operation; + } + const auto padValues = padOp->Input(MakeTensorUsage(TensorUsage::Params, 0))->tensor->View().Values(); + int top = padValues[2]; + int bottom = padValues[3]; + int left = padValues[4]; + int right = padValues[5]; + const auto &k = operation->Kernel(); + const auto &kwh = k->DilatedWH(); + if ( left + right >= kwh.x || top + bottom >= kwh.y ) + { + // Too much padding + return operation; + } + const auto &ifmShape = padOp->Input(TensorUsage::IFM0)->shape; + int bottomPad = CalcPadAfter(ifmShape.Height(), k->Stride().y, kwh.y, top, bottom); + int rightPad = CalcPadAfter(ifmShape.Width(), k->Stride().x, kwh.x, left, right); + // Adjust the padding attributes of the convolution operator + auto kernel = std::make_unique( + Kernel(k->Size(), k->Stride(), k->Dilation(), k->DepthMultiplier(), Margin(top, left, bottomPad, rightPad))); + 
operation->SetKernel(std::move(kernel)); + operation->CopyInput(TensorUsage::IFM0, *(padOp->Input(TensorUsage::IFM0))); + if ( padOfm->Readers().empty() ) + { + // Bypass the PAD operator + padOp->Disconnect(); + } + } + return operation; +} + +void TFLiteGraphOptimiser::MakeMemoryCopyForPad( + const char *name, const Operation *operation, TensorConnection *ofmConn, const Shape &shape, const Shape &offset) +{ + auto dtype = ofmConn->tensor->Type(); + std::vector zeroBuf(DataTypeStorageSizeBytes(dtype, shape.Elements())); + std::fill(zeroBuf.begin(), zeroBuf.end(), uint8_t(ofmConn->quantization.zeroPoints[0])); + + auto zeroTens = CreateConstTensor(ofmConn->tensor->Name() + "/" + name, dtype, std::make_shared(std::move(zeroBuf)), &shape); + auto op = std::make_shared(OpType::MemoryCopy); + op->SetRounding(RoundMode::NATURAL); + + op->ConnectInput(TensorUsage::IFM0, zeroTens).Set(ofmConn->quantization); + op->ConnectOutput(TensorUsage::OFM, ofmConn->tensor).Set(ofmConn->shape).Set(ofmConn->quantization).Set({offset, shape}); + RecordOptimisation(operation, op.get()); +} + +// Rewrites PAD operator to a MemoryCopy that copies the IFM to the OFM +// + up to 4 MemoryCopy operators that fill the OFM with zeros at the borders. 
+// This is done as fall-back for the PAD operators that remain after ReplacePadByExplicitPadding +Operation *TFLiteGraphOptimiser::ConvertPad(Graph *const graph, Operation *const operation) +{ + UNUSED(graph); + if ( operation->Type() != OpType::Pad ) + { + return operation; + } + const auto &ifmConn = operation->Input(TensorUsage::IFM0); + const auto &ifmShape = ifmConn->shape; + const auto &ofmConn = operation->Output(TensorUsage::OFM); + const auto &ofmShape = ofmConn->shape; + const auto ¶msConn = operation->Input(TensorUsage::Params); + const auto padValues = paramsConn->tensor->View().Values(); + int top = padValues[2]; + int bottom = padValues[3]; + int left = padValues[4]; + int right = padValues[5]; + int near = padValues[6]; + int far = padValues[7]; + + // Create MemoryCopy op that copies IFM to the right place inside the OFM + Shape shp0 = ofmShape.WithZeros(); + auto mainOp = MakeMemoryCopyForConcat(ofmConn, ifmConn, shp0.WithHeight(top).WithWidth(left).WithDepth(near)); + RecordOptimisation(operation, mainOp.get()); + // Add operations that fill the borders of the OFM + if ( top > 0 ) + { + Shape shape = ofmShape.WithHeight(top); + MakeMemoryCopyForPad("top", operation, ofmConn, shape, shp0); + } + if ( bottom > 0 ) + { + Shape shape = ofmShape.WithHeight(bottom); + Shape offset = shp0.WithHeight(ofmShape.Height() - bottom); + MakeMemoryCopyForPad("bottom", operation, ofmConn, shape, offset); + } + if ( left > 0 ) + { + Shape shape = ifmShape.WithWidth(left).WithDepth(ofmShape.Depth()); + Shape offset = shp0.WithHeight(top); + MakeMemoryCopyForPad("left", operation, ofmConn, shape, offset); + } + if ( right > 0 ) + { + Shape shape = ifmShape.WithWidth(right).WithDepth(ofmShape.Depth()); + Shape offset = shp0.WithHeight(top).WithWidth(ofmShape.Width() - right); + MakeMemoryCopyForPad("right", operation, ofmConn, shape, offset); + } + if ( near > 0 ) + { + Shape shape = ifmShape.WithDepth(near); + Shape offset = shp0.WithHeight(top).WithWidth(left); + 
MakeMemoryCopyForPad("near", operation, ofmConn, shape, offset); + } + if ( far > 0 ) + { + Shape shape = ifmShape.WithDepth(far); + Shape offset = shp0.WithHeight(top).WithWidth(left).WithDepth(ofmShape.Depth() - far); + MakeMemoryCopyForPad("far", operation, ofmConn, shape, offset); + } + operation->Disconnect(); + return mainOp.get(); +} + +TFLiteGraphOptimiser::TFLiteGraphOptimiser(Architecture *arch, const GraphOptimiserOptions &options, OptimiserDatabase *db) : + GraphOptimiser(arch, options, db) +{ + _softmax = std::make_unique(arch, db); +} + +void TFLiteGraphOptimiser::OptimiseGraph(Graph *graph) +{ + for ( auto iOpt = GraphOptimisationSteps().begin(); iOpt != GraphOptimisationSteps().end(); ++iOpt ) + { + LOG_TRACE1("GraphOptimiser {0}/{1}\n", std::distance(GraphOptimisationSteps().begin(), iOpt) + 1, + GraphOptimisationSteps().size()); + // Check if function lists are empty. Do not call for step that only contain disabled debug functions. + if ( !iOpt->opFunction.empty() || !iOpt->tensorFunction.empty() ) + { + RewriteGraph(graph, *iOpt); + } + } +} + +} // namespace regor diff --git a/ethosu/regor/compiler/tflite_graph_optimiser.hpp b/ethosu/regor/compiler/tflite_graph_optimiser.hpp new file mode 100644 index 00000000..4842b0e0 --- /dev/null +++ b/ethosu/regor/compiler/tflite_graph_optimiser.hpp @@ -0,0 +1,312 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "common/logging.hpp" + +#include "architecture/architecture.hpp" +#include "common/scaling.hpp" +#include "graph.hpp" +#include "graph_optimiser.hpp" +#include "op_type.hpp" +#include "operation.hpp" +#include "softmax.hpp" +#include "tensor.hpp" + +#include +#include +#include +#include +#include +#include +#include + +namespace regor +{ +// Sigmoid clamp (-8, 8) +static double ClampSigmoid8(double value) +{ + return ClampSigmoid(value, 8.0); +}; + +/// +/// TFLite Graph optimiser +/// +class TFLiteGraphOptimiser : public GraphOptimiser +{ + using OpRewriteFunction = Operation *(TFLiteGraphOptimiser::*)(Graph *, Operation *); + using TensorRewriteFunction = Tensor *(TFLiteGraphOptimiser::*)(Graph *, Tensor *); + using GraphOptStepArray = std::vector>; + +private: + std::unique_ptr _softmax; + + // utility functions + + // Is the scaling of tensor a and b valid and equal. + bool IsScalingValidAndEqual(const TensorConnection &a, const TensorConnection &b); + // Multiplies int with QuantizedScale with rounding. + int MultiplyByQuantizedMultiplier(int x, QuantizedScale quantScale); + + Operation *MakeMulWithConstTensor(const std::string &name, const TensorConnection &ifmConn, + const TensorConnection &ofmConn, const std::shared_ptr &constTens, const Quantization &quantization); + + // Helper function for converting operations + Operation *MakeOperation(OpType opType, const TensorConnection *ifm0Conn, const TensorConnection *ifm1Conn, const TensorConnection *ofmConn); + + // Converts 16-bit Leaky ReLU + Operation *ConvertLeakyRelu16bit(TensorConnection &ifmConn, TensorConnection &ofmConn, Operation *operation); + + // Get axis parameter for operator + int GetAxis(const Operation *const operation); + // Calculate the read shape and offset values for Slice. 
+ void SetSliceOffsetValues(Operation *const operation, Shape &readShape, Shape &readOffset); + // Calculate the read shape and offset values for StridedSlice. + void SetStridedSliceOffsetValues(Operation *const operation, const TensorConnection *const ifmConn, Shape &readShape, Shape &readOffset); + // Creates MemoryCopy operation for the given ifm/ofm and write offset. + std::shared_ptr MakeMemoryCopyForConcat( + const TensorConnection *const ofmConn, const TensorConnection *const ifmConn, const Shape &writeOffset); + // Creates a MemoryCopy operation for the given ifm/ofm and readOffset. + std::shared_ptr MakeMemoryCopyForSplitOps(const TensorConnection *const ofmConn, + const TensorConnection *const ifmConn, const Shape &readShape, const Shape &readOffset); + + // Creates the desired shape of either: + // - Concat (Input shape - supply IFM base shape) + // - Split/SplitV (Output shape - supply OFM base shape) + // + // returns the Desired shape. + // Also calculates the axis4D, returned through supplied pointer. + Shape MakeConcatSplitDesiredShape(int axis, const Shape &baseShape, int *const axis4D); + + // Creates the desired shape of either: + // - pack (Input shape - supply IFM base shape) + // - unpack (Output shape - supply OFM base shape) + // + // returns the Desired shape. + // Unpack keeps the unpacked dimension set to 1. + // Also calculates the axis4D, returned through supplied pointer. + Shape MakePackUnpackDesiredShape(int axis, const Shape &baseShape, int *const axis4D); + + // Creates the desired Output shape of StridedSlice. + // + // returns the Desired shape. 
+ Shape MakeStridedSliceDesiredShape(Operation *const operation, const Shape &baseShape); + // Move Split/slice op to consumer + void MoveToConsumer(const Operation *const operation, Operation *const cons); + + static void ReplaceOperation(Operation *const operationToReplace, Operation *const newOperation); + + Operation *MakeDepthwiseMeanOp(const TensorConnection *ifmConn, const Shape &ifmShape4D, const Shape &readShape, + const Shape &readOffset, const Shape &ofmShape4D, int w, int h, const std::string &name, std::shared_ptr &weightTensor, + std::shared_ptr biasTensor, const Quantization &ifmQuant, const Quantization &weightQuant, const Quantization &ofmQuant); + + // Converts op to int8/uint8 LUT which is generated with the given function. + Operation *ConvertToLUT8(Operation *op, std::function func, const std::string &name); + + // Converts op to int16 interpolating LUT which is generated with the given function. + Operation *ConvertToInterpolatingLUT16(Operation *op, std::function func, const std::string &name); + + // Converts int16 Tanh/Sigmoid to LUT16 + Operation *ConvertTanhSigmoidToLUT16(Operation *const op); + + + // Rewrite functions + Operation *ConvertExpToLUT(Graph *const graph, Operation *const operation); + Operation *RewriteConcat(Graph *const graph, Operation *const operation); + Operation *RewriteSplit(Graph *const graph, Operation *const operation); + Operation *RemoveReshape(Graph *const graph, Operation *const operation); + Operation *ExtractTransposePermutation(Graph *const graph, Operation *const operation); + Operation *RemoveTranspose(Graph *const graph, Operation *const operation); + Operation *RemoveReverse(Graph *const graph, Operation *const operation); + Operation *ConvertGather(Graph *const graph, Operation *const operation); + Operation *ConvertScatter(Graph *const graph, Operation *const operation); + Operation *ConvertResize(Graph *const graph, Operation *const operation); + Operation *ConvertArgMax(Graph *const graph, Operation 
*const operation); + Operation *MoveSplitSliceToConsumer(Graph *const, Operation *const operation); + + // RewriteBatchMatMul must be called before rewrite of transpose + Operation *CreateTransposeForMatMul(const std::shared_ptr &ifm, const Shape &ofmShape); + Operation *RewriteBatchMatMul(Graph *const, Operation *const operation); + Operation *RewriteSpaceToBatchConvBatchToSpace(Graph *const, Operation *const operation); + Operation *FixupDilationGT2(Graph *const, Operation *const operation); + Operation *FixupBias(Graph *const, Operation *const operation); + + // Rewrite FullyConnect with dynamic weights to MatMul + Operation *RewriteFullyConnectDynamic(Graph *const, Operation *const operation); + + Operation *CreateCastToInt32(const TensorConnection *ifmConn); + Operation *RewriteSquaredDifference(Graph *const, Operation *const operation); + + // Convert depthwise convolutions with a depth multiplier greater than 1 into a single Conv2D if: + // - the input depth is 1; and + // - the output depth equals the depth multiplier. + Operation *RewriteDepthwise(Graph *const, Operation *const operation); + + // Check that no reshape like operations remain in graph. 
+ Operation *CheckReshapeOpsRemoved(Graph *const graph, Operation *const operation); + + Operation *ConvertSoftmaxOps(Graph *const graph, Operation *const operation); + Operation *RewriteFullyConnectedInput(Graph *const graph, Operation *const operation); + + // Unroll convolution if stride is too great + Operation *UnrollConv(Graph *const, Operation *const operation); + + // Must be called after RewriteFullyConnectedInput + Operation *ConvertBatchedFullyConnected(Graph *const graph, Operation *const operation); + Operation *ConvertMeanOps(Graph *const, Operation *const operation); + + // Converts int8/uint8 Sigmoid and Tanh to a LUT based solution + Operation *ConvertTanhSigmoidToLUT(Graph *const, Operation *const operation); + + // Convert PReLU to (ReLU + Minimum + Mul + Add) + Operation *ConvertPrelu(Graph *const graph, Operation *const operation); + + // Converts Leaky ReLU when needed (LUT based solution or mul + max). + Operation *ConvertLeakyRelu(Graph *const graph, Operation *const operation); + + // Converts HardSwish to a LUT based solution. + Operation *ConvertHardSwishToLUT(Graph *const graph, Operation *const operation); + + // Converts RSqrt to a LUT based solution. + Operation *ConvertRSqrtToLUT(Graph *const graph, Operation *const operation); + + // Based on explicit padding provided in a PAD operation, returns adjusted value for + // padAfter that provides equivalent results when used with explicit padding + int CalcPadAfter(int inputSize, int stride, int filterSize, int padBefore, int padAfter); + + // Tries to completely remove a PAD operator by using explicit padding. + // E.g. a PAD operation that pads 1, followed by a CONV with VALID padding and kernel size 3 + // is rewritten such that the PAD is removed, and the CONV uses explicit padding. + // Converts tens1 -> PAD -> tens2 -> CONV to tens1 -> CONV + // This is the most efficient way to implement PAD, but cannot be done for all pad sizes. 
+ Operation *ReplacePadByExplicitPadding(Graph *const graph, Operation *const operation); + + void MakeMemoryCopyForPad(const char *name, const Operation *operation, TensorConnection *ofmConn, + const Shape &shape, const Shape &offset); + + // Rewrites PAD operator to a MemoryCopy that copies the IFM to the OFM + // + up to 4 MemoryCopy operators that fill the OFM with zeros at the borders. + // This is done as fall-back for the PAD operators that remain after ReplacePadByExplicitPadding + Operation *ConvertPad(Graph *const graph, Operation *const operation); + +public: + // The graph optimisation steps. + // Order matters, array of rewrites processed in order. + // clang-format off + const GraphOptStepArray _graphOptimisationSteps = + {{ + { + { +#if LOG_TRACE1_ON + &GraphOptimiser::VisitTensorLog +#endif + }, + { +#if LOG_TRACE1_ON + &GraphOptimiser::VisitOperatorLog, +#endif + } + }, + { + {}, + { + &TFLiteGraphOptimiser::RewriteConcat + } + }, + { + {}, + { + &TFLiteGraphOptimiser::RewriteSplit + } + }, + { + {}, + { + &TFLiteGraphOptimiser::RewriteBatchMatMul, + &TFLiteGraphOptimiser::RewriteFullyConnectDynamic + } + }, + { + {}, + { + &TFLiteGraphOptimiser::RemoveReshape, + &TFLiteGraphOptimiser::RemoveTranspose, + &TFLiteGraphOptimiser::RemoveReverse, + } + }, + { + {}, + { + &TFLiteGraphOptimiser::RewriteSpaceToBatchConvBatchToSpace, + &TFLiteGraphOptimiser::FixupDilationGT2, + &TFLiteGraphOptimiser::FixupBias, + &TFLiteGraphOptimiser::RewriteDepthwise, + &TFLiteGraphOptimiser::RewriteFullyConnectedInput, + &TFLiteGraphOptimiser::ConvertArgMax, + &TFLiteGraphOptimiser::ConvertBatchedFullyConnected, + &TFLiteGraphOptimiser::ConvertExpToLUT, + &TFLiteGraphOptimiser::ConvertTanhSigmoidToLUT, + &TFLiteGraphOptimiser::ConvertSoftmaxOps, + &TFLiteGraphOptimiser::ReplacePadByExplicitPadding, + &TFLiteGraphOptimiser::ConvertMeanOps, + &TFLiteGraphOptimiser::ConvertPrelu, + &TFLiteGraphOptimiser::ConvertLeakyRelu, + &TFLiteGraphOptimiser::ConvertHardSwishToLUT, + 
&TFLiteGraphOptimiser::ConvertRSqrtToLUT, + &TFLiteGraphOptimiser::ConvertGather, + &TFLiteGraphOptimiser::RewriteSquaredDifference, + &TFLiteGraphOptimiser::ConvertScatter, + &TFLiteGraphOptimiser::ConvertResize, + &TFLiteGraphOptimiser::UnrollConv, + } + }, + // MoveSplitSliceToConsumer need to be done after any other optimisation that can affect the ifm/ofm shapes + // has been performed, since the ifm/ofm shapes are of importance to this function. + { + {}, + { + &TFLiteGraphOptimiser::ConvertPad, + &TFLiteGraphOptimiser::MoveSplitSliceToConsumer + } + }, + { + { +#if LOG_TRACE1_ON + &GraphOptimiser::VisitTensorLog +#endif + }, + { + &TFLiteGraphOptimiser::CheckReshapeOpsRemoved, +#if LOG_TRACE1_ON + &GraphOptimiser::VisitOperatorLog, +#endif + &GraphOptimiser::RecordOptimisation + } + } + }}; + // clang-format on + + explicit TFLiteGraphOptimiser(Architecture *arch, const GraphOptimiserOptions &options, OptimiserDatabase *db); + + const GraphOptStepArray &GraphOptimisationSteps() const { return _graphOptimisationSteps; } + + void OptimiseGraph(Graph *graph); +}; + +} // namespace regor diff --git a/ethosu/regor/compiler/tflite_graph_optimiser_tp.cpp b/ethosu/regor/compiler/tflite_graph_optimiser_tp.cpp new file mode 100644 index 00000000..3d74371b --- /dev/null +++ b/ethosu/regor/compiler/tflite_graph_optimiser_tp.cpp @@ -0,0 +1,145 @@ +// +// SPDX-FileCopyrightText: Copyright 2021, 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2020 The TensorFlow Authors. All Rights Reserved. +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "compiler/tflite_graph_optimiser.hpp" + +namespace +{ + +// Implementation from TensorFlow Lite Micro kernel +int16_t SaturatingLeftShift(std::int16_t value, int amount) +{ + int32_t result = value << amount; + result = std::min(result, std::numeric_limits::max()); + result = std::max(result, std::numeric_limits::min()); + return int16_t(result); +} + +// Implementation from TensorFlow Lite Micro kernel +// Similar to ARM instruction SQDMULH. +// Similar to gemmlowp::SaturatingRoundingDoublingHighMul except +// rounding to zero instead of to nearest (SQRDMULH). +int16_t SaturatingDoublingHighMul(int16_t a, int16_t b) +{ + bool overflow = a == b && a == std::numeric_limits::min(); + int32_t a_32(a); + int32_t b_32(b); + int32_t ab_32 = a_32 * b_32; + int16_t ab_x2_high16 = int16_t((ab_32) / (1 << 15)); + return overflow ? std::numeric_limits::max() : ab_x2_high16; +} + +} // namespace + +namespace regor +{ + +// Converts HardSwish to a LUT based solution. 
+Operation *TFLiteGraphOptimiser::ConvertHardSwishToLUT(Graph *const graph, Operation *const operation) +{ + UNUSED(graph); + auto returnOp = operation; + auto opType = operation->Type(); + auto ifmConn = operation->Input(TensorUsage::IFM0); + auto ofmConn = operation->Output(TensorUsage::OFM); + + if ( opType == OpType::HardSwish && ifmConn != nullptr && ofmConn != nullptr ) + { + auto ifm = ifmConn->tensor.get(); + auto ofm = ofmConn->tensor.get(); + + // Generate the LUT + const double ifmScale = ifmConn->quantization.scales[0].Dequantize(); + const double ofmScale = ofmConn->quantization.scales[0].Dequantize(); + const int zpIn = int(ifmConn->quantization.zeroPoints[0]); + const int zpOut = int(ofmConn->quantization.zeroPoints[0]); + const int qMin = ifm->Type() == DataType::Int8 ? -128 : 0; + const int qMax = ifm->Type() == DataType::Int8 ? 127 : 255; + + const double ifmScaleHires = (1.0 / 128.0) * ifmScale; + const double reluMultiplier = 3.0 / 32768.0; + + QuantizedScale outScale(ifmScaleHires / ofmScale); + QuantizedScale reluScale(ifmScaleHires / reluMultiplier); + int16_t outScale16 = DownScaleInt32ToInt16Multiplier(outScale.scale); + int16_t reluScale16 = DownScaleInt32ToInt16Multiplier(reluScale.scale); + // convert to left shift-positive notation + int outShift = 31 - outScale.shift; + int reluShift = 31 - reluScale.shift; + + std::vector lut; + lut.reserve(256); + for ( int x = qMin; x <= qMax; ++x ) + { + // Compute the "relu-ish multiplier". 
+ // This matches the code in TensorFlow Lite Micro kernel + const int16_t inputValue = int16_t(x - zpIn); + + const int16_t inputValueOnHiresInputScale = int16_t(inputValue << 7); + + const int16_t inputValueOnPreshiftOutputScale = gemmlowp::SaturatingRoundingDoublingHighMul(inputValueOnHiresInputScale, outScale16); + + int16_t reluValue = inputValueOnHiresInputScale; + + if ( reluShift > 0 ) + { + reluValue = SaturatingLeftShift(reluValue, reluShift - 1); + } + + reluValue = gemmlowp::SaturatingRoundingDoublingHighMul(reluValue, reluScale16); + + if ( reluShift > 0 ) + { + reluValue = SaturatingLeftShift(reluValue, 1); + } + + if ( reluShift < 0 ) + { + reluValue = gemmlowp::RoundingDivideByPOT(reluValue, -reluShift); + } + reluValue = int16_t((reluValue + (1 << 15)) >> 1); + + const int16_t preshiftOutputValue = SaturatingDoublingHighMul(reluValue, inputValueOnPreshiftOutputScale); + + int16_t outputValue = gemmlowp::RoundingDivideByPOT(preshiftOutputValue, -outShift); + + int lutVal = outputValue + zpOut; + lutVal = std::min(qMax, std::max(qMin, lutVal)); + lut.push_back(uint8_t(lutVal)); + } + + auto lutTens = CreateConstTensor("hardswish", ifmConn->tensor->Type(), std::make_shared(std::move(lut))); + // The LUT must be applied without any preceding rescaling (the LUT itself performs the rescale), + // so even if the OFM has a different scale than the IFM, the generated OFM scale instructions + // should be the same as the IFM + returnOp = CreateLUT(ifmConn->tensor, lutTens, ifmConn->quantization, ifmConn->quantization, lutTens->Type(), + &ifmConn->shape, ofmConn->tensor, ifmConn->slice, ofmConn->slice); + returnOp->SetRounding(RoundMode::NATURAL); + } + + if ( operation != returnOp ) + { + RecordOptimisation(operation, returnOp); + operation->Disconnect(); + } + + return returnOp; +} + +} // namespace regor diff --git a/ethosu/regor/compiler/tosa_graph_validator.cpp b/ethosu/regor/compiler/tosa_graph_validator.cpp new file mode 100644 index 00000000..4213beda 
--- /dev/null +++ b/ethosu/regor/compiler/tosa_graph_validator.cpp @@ -0,0 +1,92 @@ +// +// SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "tosa_graph_validator.hpp" + +#include "compiler.hpp" + +#include + +namespace +{ + +std::optional MaybeGetTosaVersion(uint32_t syntaxVersion) +{ + if ( syntaxVersion == 0 ) syntaxVersion = (GraphApi::VERSION_TOSA_0_60 | GraphApi::PROFILE_BASELINE); + if ( (syntaxVersion & GraphApi::VERSION_TOSA_0_60) == GraphApi::VERSION_TOSA_0_60 ) + { + return GraphApi::VERSION_TOSA_0_60; + } + else if ( (syntaxVersion & GraphApi::VERSION_TOSA_0_80) == GraphApi::VERSION_TOSA_0_80 ) + { + return GraphApi::VERSION_TOSA_0_80; + } + else if ( (syntaxVersion & GraphApi::VERSION_TOSA_1_00) == GraphApi::VERSION_TOSA_1_00 ) + { + return GraphApi::VERSION_TOSA_1_00; + } + return std::nullopt; +} + +} // namespace + +namespace regor +{ + +bool TosaGraphValidator::HandlesSyntax(uint32_t syntaxVersion) +{ + return MaybeGetTosaVersion(syntaxVersion).has_value(); +} + +TosaGraphValidator::TosaGraphValidator(GraphNotation notation, uint32_t syntaxVersion, Compiler *compiler) : + GraphValidator(notation, syntaxVersion) +{ + _context.version = MaybeGetTosaVersion(syntaxVersion).value_or(GraphApi::VERSION_TOSA_0_60); + + if ( (syntaxVersion & GraphApi::PROFILE_MAIN) == GraphApi::PROFILE_MAIN ) + { + _context.profile = 
GraphApi::PROFILE_MAIN; + } + else + { + _context.profile = GraphApi::PROFILE_BASELINE; + } + _context.GetGraph = [compiler](const char *name) { return compiler->GetGraph(name); }; +} + +bool TosaGraphValidator::Validate(Graph *graph) +{ + bool graphValid = true; + Graph::TraverseGraphFromEnd(graph->Outputs(), + [&graphValid, &graph, this](Operation *op) -> bool + { + try + { + tosa::validator::ValidateOperator(op, _context); + } + catch ( const std::invalid_argument &e ) + { + graphValid = false; + _validationErrors.emplace_back(Error{op->Type(), e.what()}); + } + return true; + }); + return graphValid; +} + +} // namespace regor diff --git a/ethosu/regor/compiler/tosa_graph_validator.hpp b/ethosu/regor/compiler/tosa_graph_validator.hpp new file mode 100644 index 00000000..ae5d0686 --- /dev/null +++ b/ethosu/regor/compiler/tosa_graph_validator.hpp @@ -0,0 +1,39 @@ +// +// SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#pragma once + +#include "graph_validator.hpp" +#include "tosa/tosa_validator.hpp" + +#include + +namespace regor +{ + +class TosaGraphValidator : public GraphValidator +{ + tosa::validator::Context _context; + +public: + TosaGraphValidator(GraphNotation notation, uint32_t syntaxVersion, Compiler *compiler); + static bool HandlesSyntax(uint32_t syntaxVersion); + bool Validate(Graph *graph) override; +}; + +} // namespace regor diff --git a/ethosu/regor/dependencies/mlw_codec/CMakeLists.txt b/ethosu/regor/dependencies/mlw_codec/CMakeLists.txt new file mode 100644 index 00000000..14b40296 --- /dev/null +++ b/ethosu/regor/dependencies/mlw_codec/CMakeLists.txt @@ -0,0 +1,183 @@ +# +# SPDX-FileCopyrightText: Copyright 2022-2024 Arm Limited and/or its affiliates +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +cmake_minimum_required(VERSION 3.15.6) +cmake_policy(SET CMP0063 NEW) +project(mlw_codec VERSION 1.0 DESCRIPTION "MLW Codec" LANGUAGES C CXX) +include(GNUInstallDirs) + +if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) + set(CMAKE_BUILD_TYPE Debug) +endif() + +option(DEBUG_PACKET "Debug packet syntax" OFF) +option(DEBUG_BITSTREAM "Debug bitstream contents" OFF) + +if (DEBUG_PACKET) + list(APPEND MLW_DEFINES "ENABLE_DEBUG_PACKET=1") +endif() + +if (DEBUG_BITSTREAM) + list(APPEND MLW_DEFINES "ENABLE_DEBUG_BITSTREAM=1") +endif() + +message(STATUS "Build type: ${CMAKE_BUILD_TYPE}") + +# Add a target library made from the above source files +add_library(mlw_codec_st STATIC + "source/mlw_encode.cpp" + "source/mlw_encode_fwd.cpp" + "source/ml_ethosu_encode.cpp" + "source/mlw_decode.cpp" +) +set_target_properties(mlw_codec_st PROPERTIES OUTPUT_NAME mlw_codec) + +# Default compiler settings +set_property(TARGET mlw_codec_st PROPERTY CXX_STANDARD 11) +set_property(TARGET mlw_codec_st PROPERTY CXX_EXTENSIONS OFF) +set_property(TARGET mlw_codec_st PROPERTY POSITION_INDEPENDENT_CODE ON) + +add_custom_target(mlw_codec DEPENDS mlw_codec_st) + +if (MSVC) + set(MLW_TOOLCHAIN_CXX_OPTIONS + "/DWIN32_LEAN_AND_MEAN" + "/DNOMINMAX" + "/D_USE_MATH_DEFINES" + "/D_CRT_SECURE_NO_WARNINGS" + "/D_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES" + "/bigobj" + "/experimental:external" + "/external:W0" + "/external:anglebrackets" + # Diagnostics + "/W3" # Default warning level (severe + significant + production quality). 
+ "/wd4200" # "nonstandard extension used : zero-sized array in struct/union" + "/wd4018" # "signed/unsigned mismatch in comparison" + "/wd4146" # operator applied to unsigned type, result still unsigned + "/wd4244" # possible loss of data + "/wd4267" # initializing: possible loss of data + "/wd4005" # allow: macro redefinition + "/wd4065" # allow: switch statement contains 'default' but no 'case' labels + "/wd4141" # allow: inline used more than once + "/wd4624" # allow: destructor was implicitly defined as deleted + "/wd4146" # operator applied to unsigned type, result still unsigned + "/wd4244" # possible loss of data + "/wd4267" # initializing: possible loss of data + "/wd5105" # allow: macro expansion producing 'defined' has undefined behavior + ) +else() + set(MLW_TOOLCHAIN_CXX_OPTIONS + # Enabled compiler options + -Wall -Wextra -Wsign-compare -Wold-style-cast -Wswitch-default + -Wformat -Wdouble-promotion -Wredundant-decls -Wlogical-op + -Wnon-virtual-dtor -Wcast-align -Wshadow + # Disabled compiler options + -Wno-format-contains-nul -Wno-format-extra-args + -Wno-unused-function -Wno-unused-label -Wno-maybe-uninitialized -Wno-unused + -Wno-unused-local-typedefs -Wno-stringop-truncation -Wno-missing-field-initializers + # Config specific options + $<$:-Werror> + ) +endif() + +# Config specific compiler defines +set(MLW_TOOLCHAIN_CXX_DEFINES + RELEASE=$> + DEBUG=$>) + +# Configure target's compilation options +target_compile_definitions(mlw_codec_st PRIVATE ${MLW_DEFINES} $<$:${MLW_TOOLCHAIN_CXX_DEFINES}>) +target_compile_options(mlw_codec_st PRIVATE $<$:${MLW_TOOLCHAIN_CXX_OPTIONS}>) + +# Properties +file(GLOB MLW_CODEC_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/include/*) + +set_target_properties(mlw_codec_st PROPERTIES + VERSION ${${PROJECT_NAME}_VERSION} + #PUBLIC_HEADER "${MLW_CODEC_HDRS}" +) + +# Include directories (private and public interface) +target_include_directories(mlw_codec_st + PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}" + INTERFACE + "$" + 
"$/${CMAKE_INSTALL_INCLUDEDIR}>" +) + +# Install +install(TARGETS mlw_codec_st + EXPORT ${PROJECT_NAME} + LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}" + ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}" + RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}" + PUBLIC_HEADER DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME}" +) +if(MSVC AND NOT CMAKE_VERSION VERSION_LESS 3.15) + install(FILES + "$/$$.pdb" + DESTINATION ${CMAKE_INSTALL_LIBDIR} + OPTIONAL + ) +endif() +install(EXPORT ${PROJECT_NAME} + NAMESPACE ${PROJECT_NAME}:: + DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}" +) + +# Aliases +add_library(${PROJECT_NAME}::mlw_codec_st ALIAS mlw_codec_st) + +# Use a PEP-656 compliant package tag +# The default value for this variable is not useful +if (NOT Python3_FOUND) + set(Python3_FIND_STRATEGY VERSION) + set(Python3_FIND_REGISTRY LAST) + set(Python3_FIND_FRAMEWORK LAST) + find_package(Python3 COMPONENTS Interpreter REQUIRED) +endif() +execute_process( + COMMAND ${Python3_EXECUTABLE} -c "import sysconfig; print(sysconfig.get_platform())" + OUTPUT_VARIABLE MLW_CODEC_SYSTEM_NAME OUTPUT_STRIP_TRAILING_WHITESPACE) + +set(CPACK_PACKAGE_NAME ${PROJECT_NAME}) +# Default variables +set(CPACK_PACKAGE_VENDOR "Arm") +set(CPACK_PACKAGE_DESCRIPTION "${PROJECT_DESCRIPTION}") +set(CPACK_PACKAGE_VERSION_MAJOR "${${PROJECT_NAME}_VERSION_MAJOR}") +set(CPACK_PACKAGE_VERSION_MINOR "${${PROJECT_NAME}_VERSION_MINOR}") +if ("${CMAKE_BUILD_TYPE}" STREQUAL "Debug") + set(CPACK_STRIP_FILES FALSE) + set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-dev-${MLW_CODEC_SYSTEM_NAME}) +else() + set(CPACK_STRIP_FILES TRUE) + set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${MLW_CODEC_SYSTEM_NAME}) +endif() +set(CPACK_VERBATIM_VARIABLES TRUE) + +# Archive generator setup +set(CPACK_BINARY_TGZ ON) +set(CPACK_BINARY_STGZ OFF) +set(CPACK_BINARY_TBZ2 OFF) +set(CPACK_BINARY_TXZ OFF) +set(CPACK_BINARY_TZ OFF) +set(CPACK_INSTALL_CMAKE_PROJECTS + 
"${CMAKE_CURRENT_BINARY_DIR};${CMAKE_PROJECT_NAME};/") + +include(CPack) diff --git a/ethosu/regor/dependencies/mlw_codec/include/mlw_decode.h b/ethosu/regor/dependencies/mlw_codec/include/mlw_decode.h new file mode 100644 index 00000000..8aca4ea2 --- /dev/null +++ b/ethosu/regor/dependencies/mlw_codec/include/mlw_decode.h @@ -0,0 +1,50 @@ +// +// SPDX-FileCopyrightText: Copyright 2022-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#ifndef __MLW_DECODE_H__ +#define __MLW_DECODE_H__ + +#pragma once + +#include + +// Result of the decode process +typedef struct ml_decode_result_t +{ + int16_t *decoded_data; // decoded weight elements + int32_t decoded_length; // decoded weight length (in elements) + int32_t section_count; // number of sections in stream + int32_t *section_sizes; // section sizes in stream +} ml_decode_result_t; + + +#if defined __cplusplus +extern "C" +{ +#endif + + void ml_decode_ethosu_stream(ml_decode_result_t *result, const uint8_t *buffer, int size_bytes); + + void mld_free(ml_decode_result_t *result); + +#if defined __cplusplus +} // extern "C" +#endif + + +#endif diff --git a/ethosu/regor/dependencies/mlw_codec/include/mlw_encode.h b/ethosu/regor/dependencies/mlw_codec/include/mlw_encode.h new file mode 100644 index 00000000..3b728f33 --- /dev/null +++ b/ethosu/regor/dependencies/mlw_codec/include/mlw_encode.h @@ -0,0 +1,114 @@ +// +// SPDX-FileCopyrightText: Copyright 2022-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#if !defined MLW_ENCODE_H +#define MLW_ENCODE_H + +#include +#include + +#if defined _MSC_VER + #define MLW_CODEC_PACKED + #define MLW_CODEC_USE_PACK_PRAGMA (1) +#else // __GNUC__ and clang + #define MLW_CODEC_PACKED __attribute__((packed)) +#endif + +// Encoder input parameters EthosU +typedef struct ml_ethosu_encode_params_t +{ + int32_t source_buffering_hint; // Recommend a buffering size + uint16_t encoder_flags; // Control flags to pass to the encoder + void* (*realloc_func)(void*, size_t, int purpose); // Custom output allocator function +} ml_ethosu_encode_params_t; + +// Resulting encoded section information +typedef struct ml_encode_section_t +{ + int32_t offset; // Byte offset of encoded section + int32_t size; // Byte size of encoded section + int32_t zeroes; // Number of zeroes encoded in a section + int8_t group_start; // Start of group +} ml_encode_section_t; + +// Result of the encode process +typedef struct ml_encode_result_t +{ + uint8_t *encoded_data; // Encoded weight data + int32_t encoded_length; // Encoded weight length (in bytes) + int32_t source_length; // Source elements read + ml_encode_section_t *section_info; // Array of sections in stream + int32_t section_count; // Number of section in stream +} ml_encode_result_t; + +#define MLW_SOURCE_QUERY_WEIGHTS 0 +#define MLW_SOURCE_QUERY_SHIFTS 1 + +// State of the source iterator +typedef struct ml_source_state_t +{ + uint8_t new_dim_mask; // Dimension start mask + uint8_t end_dim_mask; // Dimension end mask + bool eos; // End-of-stream flag +} ml_source_state_t; + +// Stream input callback (encoder will collect input through this function, of the size recommended by the buffering hint) +typedef int32_t (*ml_weight_source_fn)(int32_t query, ml_source_state_t *state, int16_t *buffer, int32_t size, void *user_arg); + +// Internal state context +typedef struct mle_context_t mle_context_t; + +#define MLW_ENCODE_FLAG_NONE (0) // Default encoding flag +#define MLW_ENCODE_NO_BITSTREAM (1) // Do 
not write any bitstream data (only return the length) +#define MLW_ENCODE_INSERT_PALETTE (2) // Insert a new palette header with this encode +#define MLW_ENCODE_RESET_PALETTE (4) // Clear and recalculate the palette header +#define MLW_ENCODE_PARTIAL_DATA (8) // Frequency analysis and palette will be constructed from incomplete data +#define MLW_ENCODE_NO_PADDING (16) // Disable trailing padding +#define MLW_ENCODE_NO_PALETTE_LUT (32) // Disable palette LUT generation +#define MLW_ENCODE_NO_ZERO_RUNS (64) // Disable zero run generation +#define MLW_ENCODE_DPIC_FORCE_PARAMS (128) // Force debug parameters +#define MLW_ENCODE_NEW_PALETTE (MLW_ENCODE_INSERT_PALETTE|MLW_ENCODE_RESET_PALETTE) + +#define MLW_ENCODE_SYNTAX_ETHOSU (0) // EthosU bitstream encode syntax +#define MLW_ENCODE_SYNTAX_ETHOSU_FWD (2) // EthosU FWD bitstream encode syntax + +#define MLW_ENCODE_ALLOC_GENERAL (0) // General allocations used by the encoder +#define MLW_ENCODE_ALLOC_METADATA (1) // Allocation for codec's metadata output +#define MLW_ENCODE_ALLOC_STREAM0 (2) // Stream 0 allocation for this codec +#define MLW_ENCODE_ALLOC_STREAM1 (3) // Stream 1 allocation for this codec + +#if defined __cplusplus +extern "C" +{ +#endif + // Baseline encode + mle_context_t *mle_create_context(int syntax); + int mle_context_query_zeroes(mle_context_t *ctx); + void mle_context_set_allocator(mle_context_t *ctx, void* (*realloc_func)(void*, size_t, int purpose)); + void mle_destroy_context(mle_context_t *ctx); + int mle_encode(mle_context_t *ctx, ml_encode_result_t *result, const int16_t *inbuf, int inbuf_size, unsigned mlw_encode_flags); + void mle_free(ml_encode_result_t *result); + + int32_t ml_encode_ethosu_stream(ml_encode_result_t *result, const ml_ethosu_encode_params_t *ep, ml_weight_source_fn src, void *user_arg, mle_context_t **ctx_out); + +#if defined __cplusplus +} // extern "C" +#endif + + +#endif // MLW_ENCODE_H diff --git a/ethosu/regor/dependencies/mlw_codec/source/ml_bit_buffer.hpp 
b/ethosu/regor/dependencies/mlw_codec/source/ml_bit_buffer.hpp new file mode 100644 index 00000000..61369d75 --- /dev/null +++ b/ethosu/regor/dependencies/mlw_codec/source/ml_bit_buffer.hpp @@ -0,0 +1,255 @@ +// +// SPDX-FileCopyrightText: Copyright 2022-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#if !defined ML_BIT_BUFFER_HPP +#define ML_BIT_BUFFER_HPP + +#pragma once + +#include "ml_raw_buffer.hpp" + +#include + +struct bitbuf_t +{ +private: + uint32_t *_buf; + uint32_t _next = 0; + int _pos; // bit pos of next bit + int _limit; // in bytes + int _substream_start = 0; // start position for substreams + int _substream_end = 0; // end position for substreams + bool _enabled = false; + raw_buffer_t *_buffer; +public: + // Read constructor + bitbuf_t(const void *buf, int used_bytes) : _buffer(nullptr) + { + _limit = used_bytes & ~3; + _buf = reinterpret_cast(const_cast(buf)); + _pos = 0; + } + + // Write constructor + bitbuf_t(raw_buffer_t &buffer, int reserve_bytes, bool disable_writes) : _buffer(&buffer) + { + _enabled = !disable_writes; + if ( _enabled ) _buffer->reserve(reserve_bytes); + _limit = _buffer->capacity() & ~3; + prime( _buffer->used() * 8 ); // Start at the end of the buffer's used data + } + + // Sub-stream writer + bitbuf_t(bitbuf_t &dest, int bitpos, int bitlen=0) : _buffer(dest._buffer) + { + _limit = _buffer->capacity() & ~3; + _substream_start = bitpos; + 
bitlen = (bitlen <= 0) ? (_limit * 8) - bitpos : bitlen; // Default to rest of buffer + _substream_end = bitpos + bitlen; + int required = (_substream_end + 7 ) / 8; + assert( required <= _limit ); + _enabled = dest._enabled; + prime( bitpos ); + } + +public: + void put(int len, int32_t data) + { + assert( _buf == reinterpret_cast(_buffer->begin()) && "Buffer resized externally" ); + assert( (data & ((1 << len)-1)) == data && "Data must be pre-masked" ); + assert( ((_substream_end == 0) || (_pos + len <= _substream_end)) && "Write past end of substream section" ); + if ( len > 0 && _enabled ) + { + uint32_t next = _next; + int bitpos = _pos & 0x1F; + next |= uint32_t(data) << bitpos; + + if ( len >= (32 - bitpos) ) + { + // Write won't fit, reserve more output space + if ( (_pos / 8) >= _limit ) + { + extend(); + } + + _buf[_pos >> 5] = next; // Requires little-endian + next = uint32_t(data) >> (32 - bitpos); + } + + _next = next; + } + + _pos += len; + } + + void put_masked(int len, int32_t data) + { + put(len, data & ((1 << len)-1)); + } + + void fill(int len, unsigned bit) + { + const uint32_t mask = 0xFFFFFFFF * bit; + int remain = len; + while ( remain >= 32 ) + { + put(32, mask); + remain -= 32; + } + if (remain > 0) + put(remain, mask & ((1u << remain) - 1) ); + } + + void align(int bits, int fill_byte) + { + // Alignments must be power of 2 + assert( (bits & (bits - 1)) == 0 && bits ); + const int mask = bits-1; + + int distance = (bits - (_pos & mask)) & mask; + + // Byte align first + put_masked( distance & 7, fill_byte ); + distance &= ~7; + while (distance != 0) + { + put(8, fill_byte); + distance -= 8; + } + } + + void reposition(int bitpos) + { + int end = (_substream_end != 0) ? 
_substream_end : _limit * 8; + assert( (bitpos >= 0 && bitpos <= end) && "Can't reposition out of stream" ); + assert( (_substream_end == 0 || bitpos >= _substream_start) && "Can't reposition before substream"); + if ((_pos != bitpos) && (bitpos > 0) && (bitpos <= end) ) + { + // Reposition in bitstream. Caller must flush if writing. + prime(bitpos); + } + } + + void flush(bool done=true) + { + if ( !_enabled ) return; + // If buffering word is not empty, write it out as-is. + if ( _pos & 0x1F ) + { + // If writing a substream, blend any overlapping words with the parent stream + if ( _substream_end ) + { + int remain = _substream_end - (_pos & ~0x1F); // Remaining word-bits in this substream + if ( remain < 32 ) // Will overlap happen? + { + uint32_t mask = ~0u << (32 - remain); // Mask the parent bits that we want to keep + _next = (_buf[_pos >> 5] & mask) | (_next & ~mask); + } + } + // Otherwise limited by the buffer + else + { + // Only extend by space required to flush remaining word. 
+ if ( (_pos / 8) >= _limit ) + { + extend(true); + } + } + _buf[_pos >> 5] = _next; + } + if ( done && !_substream_end ) + { + _buffer->set_used( _pos / 8 ); + } + } + + void sync(bitbuf_t &substream) + { + flush(false); + substream.flush(false); + prime( std::max(_pos, substream._pos) ); + substream._buffer = nullptr; + } + + int get(int len) + { + if ( len == 0 ) + { + return 0; + } + + const unsigned mask = (1u << len) - 1; + assert( (_pos / 8) < _limit ); + uint32_t next = _buf[_pos >> 5]; + int bitpos = _pos & 0x1F; + // Bits from this word + unsigned value = next >> bitpos; + _pos += len; + + // Some of the bits are in the next word + if ( len > (32 - bitpos) ) + { + assert( (_pos / 8) < _limit ); + next = _buf[_pos >> 5]; + value |= next << (32 - bitpos); + } + + return int(value & mask); + } + + void read_align(int bits) + { + // Alignments must be power of 2 + assert( (bits & (bits - 1)) == 0 && bits ); + const int mask = bits-1; + _pos += (bits - (_pos & mask)) & mask; + } + + bool read_eos() const { return _pos/8 >= _limit; } + + int read_avail() const { return (_limit - (_pos / 8)) * 8 - (_pos & 7); } + int read_avail(int watermark) const { return (watermark - (_pos / 8)) * 8 - (_pos & 7); } + + int pos() const { return _pos; } + int byte_pos() const { return _pos / 8; } + int byte_length() const { return _limit; } + +private: + void prime(int bitpos) + { + assert( (bitpos >= 0) && (bitpos / 8) < _limit ); + // Prime (start up) the bitstream writer at the given bit position + _pos = bitpos; + _buf = reinterpret_cast(_buffer->begin()); + _next = _buf[bitpos >> 5]; + _next &= (1u << (bitpos & 0x1F)) - 1; + } + + void extend(bool exact_resize=false) + { + assert(_enabled); + _buffer->set_used( (_pos / 8) & ~3 ); // Only use whole words + _buffer->reserve( sizeof(uint32_t), exact_resize ); // Buffer implementation must optimise small requests + assert( (_buffer->capacity() & ~3) > _limit ); + assert( _substream_end == 0 ); // Can't extend a substream + 
_limit = _buffer->capacity() & ~3; + _buf = reinterpret_cast(_buffer->begin()); + } +}; + +#endif // ML_BIT_BUFFER_HPP diff --git a/ethosu/regor/dependencies/mlw_codec/source/ml_encoder_internal.hpp b/ethosu/regor/dependencies/mlw_codec/source/ml_encoder_internal.hpp new file mode 100644 index 00000000..dc9b8b28 --- /dev/null +++ b/ethosu/regor/dependencies/mlw_codec/source/ml_encoder_internal.hpp @@ -0,0 +1,128 @@ +// +// SPDX-FileCopyrightText: Copyright 2022-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#if !defined ML_ENCODER_INTERNAL_HPP +#define ML_ENCODER_INTERNAL_HPP + +#pragma once + +#include "../include/mlw_encode.h" +#include "ml_bit_buffer.hpp" + +#include +#include + +#if __GNUC__ + #define ML_ENCODER_DLL_EXPORT __attribute__((visibility("default"))) +#elif _WIN32 + #if TARGET_WIN32_DLL + #define ML_ENCODER_DLL_EXPORT __declspec(dllexport) + #else + #define ML_ENCODER_DLL_EXPORT + #endif +#else + #error "undefined export semantics" +#endif + +#if !defined ENABLE_DEBUG_PACKET + #define ENABLE_DEBUG_PACKET (0) +#endif + +#if !defined ENABLE_DEBUG_BITSTREAM + #define ENABLE_DEBUG_BITSTREAM (0) +#endif + +#if ENABLE_DEBUG_PACKET + #include + #define PACKET_LOG(...) printf(__VA_ARGS__) +#else + #define PACKET_LOG(...) +#endif + +#if ENABLE_DEBUG_BITSTREAM + #include + #define BITSTREAM_LOG(...) printf(__VA_ARGS__) +#else + #define BITSTREAM_LOG(...) 
+#endif
+
+// Bitstream syntax constants shared by the encoder and decoder
+constexpr int ETHOSU_SLICELEN_BITS = 15;
+constexpr int ZDIV_DISABLE = 6;       // not alternating mode
+constexpr int ZDIV_EOS = 7;           // indicates end of stream
+constexpr int WDIV_UNCOMPRESSED = 7;  // indicates uncompressed weights
+
+// Weight palette state for one encoded section
+struct palette_t
+{
+    int16_t lut[32] = {0};
+    int16_t inv_lut[512] = {0};
+    int freq[512] = {0};
+    int palsize;          // number of palette entries
+    int palbits;          // bit width of palette entries
+    int direct_offset;    // added to the decoded weight index before direct conversion to sign/mag
+    bool use_zero_runs;   // zeros are coded separately
+    bool only_palette;    // no values outside the palette
+    bool only_zeros;      // special case that the section is all zeros
+};
+
+// Golomb-Rice coding parameters for one slice
+struct slice_params_t
+{
+    uint8_t w_grc_trunc;
+    bool w_uncompressed;
+    uint8_t z_grc_div;
+    uint8_t w_grc_div;
+};
+
+// Per-slice debug record (captured when enable_slice_debug is set)
+struct mle_slice_debug_t
+{
+    slice_params_t params;
+    palette_t palette;
+};
+
+// Internal encoder state context (opaque to the public C interface)
+struct mle_context_t
+{
+    palette_t palette;
+    int syntax = 0;
+    int zero_count = 0;
+    int slicelen_bits = ETHOSU_SLICELEN_BITS;
+    bool palette_valid = false;
+    bool single_slice_sections = false;
+    bool allow_empty_slices = false;
+    bool eos_required = false;
+    bool enable_slice_debug = false;
+    bool disable_lut = false;
+    int8_t fixed_wgrc = -1;
+    int8_t fixed_zgrc = -1;
+    // NOTE(review): element type was lost in extraction; restored as the
+    // per-slice debug record defined above - confirm against encoder sources
+    std::vector<mle_slice_debug_t> slice_debug;
+    void* (*realloc_func)(void*, size_t, int);  // Custom output allocator function
+};
+
+// Integer ceiling division
+inline int div_round_up(int num, int div)
+{
+    return (num + div - 1) / div;
+}
+
+int ml_encode_fwd(mle_context_t *ctx, bitbuf_t &bits, const int16_t *weights, int encode_count, unsigned mlw_encode_flags);
+int ml_encode_section(mle_context_t *ctx, const int16_t *inbuf, int size, palette_t *p, bitbuf_t *bitbuf);
+palette_t *ml_encode_palette(mle_context_t *ctx, const int16_t *weights, int encode_count, int analyse_count, unsigned mlw_encode_flags);
+void ml_encode_eos(mle_context_t *ctx, bitbuf_t &bits, unsigned mlw_encode_flags);
+int ml_encode_internal(mle_context_t *ctx, bitbuf_t &bits, const
int16_t *weights, int encode_count, int analyse_count, unsigned mlw_encode_flags); + +#endif // ML_ENCODER_INTERNAL_HPP + + + diff --git a/ethosu/regor/dependencies/mlw_codec/source/ml_ethosu_encode.cpp b/ethosu/regor/dependencies/mlw_codec/source/ml_ethosu_encode.cpp new file mode 100644 index 00000000..14f92b17 --- /dev/null +++ b/ethosu/regor/dependencies/mlw_codec/source/ml_ethosu_encode.cpp @@ -0,0 +1,114 @@ +// +// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#include "../include/mlw_encode.h" + +#include "ml_bit_buffer.hpp" +#include "ml_raw_buffer.hpp" +#include "ml_encoder_internal.hpp" + +#include +#include + +#if defined __cplusplus +extern "C" +{ +#endif + + +ML_ENCODER_DLL_EXPORT int32_t ml_encode_ethosu_stream(ml_encode_result_t *result, const ml_ethosu_encode_params_t *ep, ml_weight_source_fn src, void *user_arg, mle_context_t **ctx_out) +{ + constexpr int BUFFERING_REQUEST_SIZE = 8192; // Initial input buffering + constexpr int INITIAL_OUTPUT_BUFFER = 8192; // Initial size of output buffer (doubles at every overflow) + constexpr unsigned VALID_FLAGS = MLW_ENCODE_NO_BITSTREAM; + + assert(result && ep); + if ( !(result && ep && src) ) + { + return 0; + } + + mle_context_t *ctx = mle_create_context(MLW_ENCODE_SYNTAX_ETHOSU); + // Allow forcing parameters for debug validation - it is expected that + // the caller knows what they're doing here since it accesses the opaque + // internals via the public interface. + + assert( !(ep->encoder_flags & ~(VALID_FLAGS)) ); // Check acceptable flags + unsigned ethosu_encode_flags = (ep->encoder_flags & VALID_FLAGS); + + // Input buffering of data from the source function + assert( ep->source_buffering_hint >= 0 ); + int request_size = std::max(BUFFERING_REQUEST_SIZE, ep->source_buffering_hint & 0x00FFFFFF); + raw_buffer_t buffer( request_size ); + + // The source function will communicate the state to this encoding loop + ml_source_state_t state = {0}; + state.eos = false; + + // Output bitstream allocation + raw_buffer_t output(INITIAL_OUTPUT_BUFFER, MLW_ENCODE_ALLOC_STREAM0, ep->realloc_func); + bitbuf_t bits(output, 8, ethosu_encode_flags & MLW_ENCODE_NO_BITSTREAM); + + result->source_length = 0; + + // Repeatedly ask for values until the source function signals end-of-stream. 
+ while ( !state.eos ) + { + int16_t* buf_write = buffer.reserve(request_size); + int received = (*src)(MLW_SOURCE_QUERY_WEIGHTS, &state, buf_write, buffer.capacity() - buffer.used(), user_arg); + buffer.use(received); + + unsigned encode_flags = ethosu_encode_flags; + encode_flags |= MLW_ENCODE_NEW_PALETTE; + + int bytes_written = ml_encode_internal(ctx, bits, buffer.begin(), buffer.used(), buffer.used(), encode_flags); + if ( bytes_written < 0 ) + { + // Encoder errored + mle_destroy_context(ctx); + return -1; + } + result->source_length += buffer.used(); + buffer.clear(); + } + + ml_encode_eos(ctx, bits, ethosu_encode_flags); + + // Populate the return result + assert(bits.byte_pos() == output.used() || (ethosu_encode_flags & MLW_ENCODE_NO_BITSTREAM)); + result->encoded_length = bits.byte_pos(); + result->encoded_data = output.detach(); + result->section_info = nullptr; + result->section_count = 0; + + if (ctx_out != nullptr) + { + assert( *ctx_out == nullptr ); + *ctx_out = ctx; + } + else + { + mle_destroy_context(ctx); + } + return 1; +} + + +#if defined __cplusplus +} // extern "C" +#endif diff --git a/ethosu/regor/dependencies/mlw_codec/source/ml_raw_buffer.hpp b/ethosu/regor/dependencies/mlw_codec/source/ml_raw_buffer.hpp new file mode 100644 index 00000000..62277d4b --- /dev/null +++ b/ethosu/regor/dependencies/mlw_codec/source/ml_raw_buffer.hpp @@ -0,0 +1,161 @@ +// +// SPDX-FileCopyrightText: Copyright 2022-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// + +#if !defined ML_RAW_BUFFER_HPP +#define ML_RAW_BUFFER_HPP + +#pragma once + +#include +#include +#include +#include +#include +#include + +typedef void* (*realloc_t)(void *ptr, size_t size, int); + +template +struct raw_buffer_t +{ + static_assert(std::is_trivially_copyable::value, "expected simple storage type"); + constexpr static int CAPACITY_ALIGN = 16; + TYPE *_data; + int _used; + int _capacity; + int _reallocArg = 0; + realloc_t _realloc=&realloc_proxy; + +public: + raw_buffer_t(int capacity, int arg=0, realloc_t rfunc=nullptr) + { + assert(capacity > 0); + _realloc = (rfunc != nullptr) ? rfunc : &realloc_proxy; + _capacity = (capacity + CAPACITY_ALIGN - 1) & ~(CAPACITY_ALIGN - 1); + _reallocArg = arg; + _data = reinterpret_cast(_realloc(nullptr, _capacity * sizeof(TYPE), _reallocArg)); + _used = 0; + } + + raw_buffer_t(raw_buffer_t &&other) + { + _capacity = other._capacity; + other._capacity = 0; + _data = other._data; + other._data = nullptr; + _used = other._used; + other._used = 0; + _reallocArg = other._reallocArg; + _realloc = other._realloc; + } + + raw_buffer_t(TYPE *data, int used, int capacity) + { + _data = data; + _used = used; + _capacity = capacity; + } + + ~raw_buffer_t() + { + if (_data) + { + _realloc(_data, 0, _reallocArg); + } + } + + TYPE *begin() { return _data; } + TYPE *end() { return _data + _used; } + int used() const { return _used; } + int capacity() const { return _capacity; } + void clear() { _used = 0; } + + const TYPE &operator[](int index) const { assert(index < _used); return _data[index]; } + + void set_used(int used) + { + assert(used >= _used); + assert(used <= _capacity); + _used = used; + } + + TYPE *reserve(int count, bool exact_resize=false) + { + int req_capacity = _used + count; + if ( req_capacity > _capacity ) + { + if ( !exact_resize ) + { + req_capacity = std::max(req_capacity, _capacity * 2); + } + 
+ auto *p = reinterpret_cast( _realloc(_data, req_capacity * sizeof(TYPE), _reallocArg) ); + if ( !p ) + { + return nullptr; + } + _data = p; + _capacity = req_capacity; + } + int at = _used; + return _data + at; + } + + TYPE *use(int count) + { + int at = _used; + _used += count; + return _data + at; + } + + TYPE *detach() + { + auto tmp = _data; + _data = nullptr; + return tmp; + } + + void align(int align_bytes, TYPE fill) + { + int count = (((_used + align_bytes - 1) / align_bytes) * align_bytes) - _used; + TYPE *p = reserve(count); + use(count); + while (count--) + { + *p++ = fill; + } + } + + void remove_left(int count) + { + int to_move = _used - count; + if (to_move >= 0) + { + memmove(_data, _data + count, to_move * sizeof(TYPE)); + } + _used = to_move; + } + +private: + static void *realloc_proxy(void *ptr, size_t size, int) + { + return realloc(ptr, size); + } +}; + +#endif // ML_RAW_BUFFER_HPP diff --git a/ethosu/regor/dependencies/mlw_codec/source/mlw_decode.cpp b/ethosu/regor/dependencies/mlw_codec/source/mlw_decode.cpp new file mode 100644 index 00000000..15d597d0 --- /dev/null +++ b/ethosu/regor/dependencies/mlw_codec/source/mlw_decode.cpp @@ -0,0 +1,361 @@ +// +// SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#include "../include/mlw_decode.h" +#include "ml_encoder_internal.hpp" +#include "ml_bit_buffer.hpp" + +#include +#include +#include + +#if NDEBUG || 1 + // Release build get bits macro + #define bitbuf_get(bb_, name_, len_) bb_.get(len_) +#else + #include + + // Debug build get bits macro + inline int bitbuf_get(bitbuf_t &bb, const char *name, int len) + { + assert(len <= 32); + int tmp = bb.get(len); + printf("%6d %s:%d = %d\n", bb.pos() - len, name, len, tmp); + fflush(stdout); + return tmp; + } +#endif + +// Extract and decode weights from the given bitstream +// +// outbuf - output decoded weights +// bb - input bitstream buffer (wherever it is positioned) +// endpos - bitstream end position (in bytes) +// single_slice - process slices one-at-a-time +// slice_len - slice length in bits +// +// Returns - the number of weights extracted from the bitstream +static int ml_decode_internal(raw_buffer_t &outbuf, bitbuf_t &bb, palette_t &palette, int endpos, bool single_slice, int slice_len) +{ + int start_offset = outbuf.used(); + int w_cnt; + int w_grc_div; + int w_grc_trunc; + int w_uncompressed; + int z_grc_div; + int z_prev_grc_div = -1; + bool new_palette; + int i, j; + endpos = std::min(endpos, bb.byte_length()); + + // Loop over all slices + do { + // Decode slice header + int bits_avail = bb.read_avail(endpos); + if ( bits_avail >= 3 ) + { + z_grc_div = bitbuf_get(bb, "ZDIV", 3); + } + else // Insufficient bits left for a terminator (EOS may be optional) + { + z_grc_div = ZDIV_EOS; + int ones = bb.get(bits_avail); + assert( ones == (1 << bits_avail) - 1 ); + } + + while ( z_grc_div == ZDIV_EOS ) + { + // End of stream + // Byte align + bb.read_align(8); + if ( bb.byte_pos() >= endpos || single_slice ) + { + goto labelExit; + } + z_grc_div = bitbuf_get(bb, "ZDIV", 3); + } + if ( bb.read_avail(endpos) <= 0 ) + { + assert(false); + break; // Unexpectedly reached end of the input stream + } + assert(z_grc_div < 4 || z_grc_div == ZDIV_DISABLE); + bool 
use_zero_runs = z_grc_div != ZDIV_DISABLE; // alternating grc + w_cnt = bitbuf_get(bb, "SLICELEN", slice_len) + (slice_len == ETHOSU_SLICELEN_BITS ? 1 : 0); + w_grc_div = bitbuf_get(bb, "WDIV", 3); + w_grc_trunc = bitbuf_get(bb, "WTRUNC", 1); + new_palette = bitbuf_get(bb, "NEWPAL", 1); + if ( !new_palette ) + { + // At the moment it is not supported to change between alternating + // and non-alternating without redefining the palette (this is because + // the zero is not included in the palette in case of alternating) + bool prev_use_zero_run = z_prev_grc_div != ZDIV_DISABLE; + (void)(prev_use_zero_run); + assert((z_prev_grc_div == -1) || (use_zero_runs == prev_use_zero_run)); + } + z_prev_grc_div = z_grc_div; + if ( new_palette ) + { + palette.direct_offset = bitbuf_get(bb, "DIROFS", 5); + palette.palsize = bitbuf_get(bb, "PALSIZE", 5); + if ( palette.palsize > 0 ) + { + palette.palsize++; + } + palette.palbits = bitbuf_get(bb, "PALBITS", 3) + 2; + for ( i = 0; i < palette.palsize; i++ ) + { + palette.inv_lut[i] = int16_t(bitbuf_get(bb, "PALETTE", palette.palbits)); + } + } + + if ( w_grc_div == WDIV_UNCOMPRESSED ) + { + // Uncompressed mode + w_uncompressed = 1; + int uncompressed_bits; + if ( palette.palsize > 0 ) + { + // Uncompressed bits is given by palette size. + uncompressed_bits = 0; + while ( (1 << uncompressed_bits) < palette.palsize ) + { + uncompressed_bits++; + } + } + else + { + // No palette. PALBITS is used to specify uncompressed bits. + uncompressed_bits = palette.palbits; + } + // In uncompressed mode there's only a remainder part (no unary) + // This is achieved by setting w_grc_div to index bit width + w_grc_div = uncompressed_bits; + } + else + { + w_uncompressed = 0; + assert(w_grc_div < 6); + } + + // Decode the slice + int z_cnt = w_cnt + (( slice_len != ETHOSU_SLICELEN_BITS || new_palette ) ? 
1 : 0); + std::vector w_value(w_cnt); + std::vector z_value(z_cnt); + int w_pos = 0, z_pos = 0; + int w_prev_pos = 0, z_prev_pos = 0; + int w_unary0 = 0, w_unary1 = 0, w_unary1_len = 0, w_q[12] = {0}, wq = 0; + int z_unary = 0, z_q[12] = {0}, zq = 0; + int w_nsymbols = 0; + int w_prev_enable = 0, w_prev_nsymbols = 0, w_prev_q[12] = {0}; + int z_nsymbols = 0; + int z_prev_enable = 0, z_prev_nsymbols = 0, z_prev_q[12] = {0}; + int total_zcnt = 0; + int z_unary_len = z_grc_div < 3 ? 12 : 8; + + // Loop over all chunks in the slice + do + { + // Flow control to possibly throttle either the weights or zero-runs + int balance = use_zero_runs ? w_pos - z_pos : 0; + int w_enable = (balance < 8 || !use_zero_runs) && w_pos < w_cnt; + int z_enable = (balance >= 0 && use_zero_runs) && z_pos < z_cnt; + if ( w_enable ) + { + w_unary0 = w_uncompressed ? 0 : bitbuf_get(bb, "WUNARY0", 12); + } + if ( z_enable ) + { + z_unary = bitbuf_get(bb, "ZUNARY", z_unary_len); + z_nsymbols = 0; + for ( i = 0; i < z_unary_len; i++ ) + { + if ( z_unary & (1 << i) ) + { + zq++; + } + else + { + z_q[z_nsymbols++] = zq; + zq = 0; + } + } + z_pos += z_nsymbols; + } + + if ( w_enable ) + { + w_unary1_len = 0; + int max_symbols = w_uncompressed && w_grc_div > 5 ? 8 : 12; + if ( w_unary0 != 0 ) + { + for ( i = 0; i < max_symbols; i++ ) + { + if ( w_unary0 & (1 << i) ) + { + w_unary1_len++; + } + } + } + w_unary1 = (w_unary1_len > 0) ? 
bitbuf_get(bb, "WUNARY1", w_unary1_len) : 0; + w_nsymbols = 0; + + for ( i = 0; i < max_symbols && (w_nsymbols < (w_cnt - w_pos)); i++ ) + { + int code = 0; + if ( w_unary0 & (1 << i) ) + { + code = 1 + ( w_unary1 & 1 ); + w_unary1 = w_unary1 >> 1; + } + wq += code; + if ( code < 2 || w_grc_trunc ) + { + w_q[w_nsymbols++] = wq; + wq = 0; + } + } + w_pos += w_nsymbols; + } + + // Remainders corresponding to the quotients in the previous chunk + if ( w_prev_enable ) + { + for ( i = 0; i < w_prev_nsymbols && w_prev_pos < w_cnt; i++, w_prev_pos++ ) + { + int remain = bitbuf_get(bb, "WREMAIN", w_grc_div); + w_value[w_prev_pos] = (w_prev_q[i] << w_grc_div) + remain; + } + } + if ( z_prev_enable ) + { + for ( i = 0; i < z_prev_nsymbols && z_prev_pos < z_cnt; i++, z_prev_pos++ ) + { + int remain = 0; + if ( z_grc_div != 0 ) + { + remain = bitbuf_get(bb, "ZREMAIN", z_grc_div); + } + z_value[z_prev_pos] = (z_prev_q[i] << z_grc_div) + remain; + total_zcnt += z_value[z_prev_pos]; + } + } + w_prev_enable = w_enable; + w_prev_nsymbols = w_nsymbols; + std::copy(std::begin(w_q), std::end(w_q), std::begin(w_prev_q)); + z_prev_enable = z_enable; + z_prev_nsymbols = z_nsymbols; + std::copy(std::begin(z_q), std::end(z_q), std::begin(z_prev_q)); + } while ( w_prev_enable || z_prev_enable ); + + // Interleave non-zero and zeros into the outbuf buffer + // Increase the outbuffer to fit the new slice + int16_t *p = outbuf.reserve(w_cnt + total_zcnt); + + // Insert initial zeros + if ( ( slice_len != ETHOSU_SLICELEN_BITS || new_palette ) && use_zero_runs ) + { + for ( j = 0; j < z_value[0]; j++ ) + { + *p++ = 0; + } + } + + // Loop over all weights and insert zeros in-between + for ( i = 0; i < w_cnt; i++ ) + { + int val; + assert(w_value[i] < 512); // HW supports 9bit + if ( w_value[i] < palette.palsize ) + { + val = palette.inv_lut[w_value[i]]; + } + else + { + val = w_value[i] - palette.palsize + palette.direct_offset; + } + int sign = val & 1; + int mag = val >> 1; + *p++ = sign ? 
int16_t(-mag) : int16_t(mag); + if ( use_zero_runs ) + { + for ( j = 0; j < z_value[i + (new_palette ? 1 : 0)]; j++ ) + { + *p++ = 0; + } + } + } + + outbuf.use(w_cnt + total_zcnt); + } while (!single_slice); +labelExit: + return outbuf.used() - start_offset; +} + + +constexpr int INITIAL_BLOCKS = 4; + + +#if defined __cplusplus +extern "C" +{ +#endif + +// Decode a stream +// +// result - Resulting data from decode (must be freeed after use) +// buffer - Incoming bitstream buffer +// size_bytes - Size of the bitstream buffer (in bytes) +void ml_decode_ethosu_stream(ml_decode_result_t *result, const uint8_t *buffer, int size_bytes) +{ + assert(result && buffer && size_bytes); + result->decoded_data = nullptr; + result->section_sizes = nullptr; + + bitbuf_t bb(buffer, size_bytes); + raw_buffer_t output(4096, MLW_ENCODE_ALLOC_STREAM0, nullptr); + palette_t palette; + ml_decode_internal(output, bb, palette, size_bytes, false, ETHOSU_SLICELEN_BITS); + + // Populate the results set + result->decoded_length = output.used(); + result->decoded_data = output.detach(); +} + +ML_ENCODER_DLL_EXPORT void mld_free(ml_decode_result_t *result) +{ + if ( result ) + { + if ( result->decoded_data ) + { + result->decoded_data = static_cast(realloc( result->decoded_data, 0)); + } + if ( result->section_sizes ) + { + free( result->section_sizes ); + result->section_sizes = nullptr; + } + } +} + + +#if defined __cplusplus +} // extern "C" +#endif + diff --git a/ethosu/regor/dependencies/mlw_codec/source/mlw_encode.cpp b/ethosu/regor/dependencies/mlw_codec/source/mlw_encode.cpp new file mode 100644 index 00000000..3a05dab3 --- /dev/null +++ b/ethosu/regor/dependencies/mlw_codec/source/mlw_encode.cpp @@ -0,0 +1,961 @@ +// +// SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "../include/mlw_encode.h" + +#include "ml_encoder_internal.hpp" +#include "ml_raw_buffer.hpp" +#include "ml_bit_buffer.hpp" + +#include +#include +#include +#include +#include +#include + +constexpr int ZERO_FREQ_THRESHOLD = 5; +constexpr int MIN_ZERO_RUN_LENGTH = 2; + +// Create palette from the given frequencies +// Freq index 0-511 correspond to weights -256..255 +// partial_data - don't make decisions about data that will be encoded +// that wasn't included in the frequency analysis. +static void create_palette(palette_t *p, bool partial_data, bool disable_lut) +{ + uint64_t freq64[512] = {0}; + int i, all_cnt, all_max_val; + + // Pair the frequency with the value so that + // the array can be sorted on frequency while keeping + // track of the corresponding palette value + all_cnt = 0; + all_max_val = 0; + for ( i = -255; i < 256; i++ ) + { + if ( i == 0 && p->use_zero_runs ) continue; + int sign = i < 0; + int mag = abs(i); + int palval = (mag << 1) | sign; + + // Store palette value in 16 LSB bits, which will not affect the sorting + freq64[palval] = ((static_cast(p->freq[i + 256])) << 16) | palval; + all_cnt += p->freq[i + 256]; + + if ( p->freq[i + 256] > 0 ) + { + all_max_val = std::max(all_max_val, palval); + } + } + + // Cannot use direct offset with partial data. 
+ p->only_zeros = !partial_data && (all_cnt == 0); + p->direct_offset = 0; + if ( !partial_data && (all_cnt != 0) ) + { + // Find the first non-used weight value around zero (0, -1, +1, -2, +2 etc) + for ( i = 0; i < 31; i++ ) + { + if ( (freq64[i] >> 16) != 0 ) + { + break; + } + } + p->direct_offset = i; + } + + // Sort in descending frequency order + std::sort(std::begin(freq64), std::end(freq64), [](uint64_t a, uint64_t b) { return b < a; }); + + // Check if all weights fit into the palette (and the palette is not empty) + p->only_palette = !disable_lut && !partial_data && (freq64[0] >> 16) > 0 && (freq64[32] >> 16) == 0; + + int max_palette_size; + if ( p->only_palette ) + { + max_palette_size = 32; + } + else + { + // For direct-lut we must make sure that the encoded weight + // index is not > 511. We do that by limiting the palette size + // such that the greatest value can be reached after subtracting + // the palette size. + max_palette_size = std::min(32, 511 - all_max_val); + if ( max_palette_size == 1 ) + { + max_palette_size = 0; // because palette of size 1 is not supported + } + } + + // Setup the 32 entry palette + int max_lut_val = 0, val, cnt, lut_cnt = 0; + for ( i = 0; i < max_palette_size; i++ ) + { + cnt = static_cast(freq64[i] >> 16); + val = freq64[i] & 0xffff; + // If partial data, all palette entries must be filled (even if they're wrong) + if ( cnt == 0 && !partial_data ) break; + p->lut[i] = int16_t(val); + max_lut_val = std::max(max_lut_val, val); + lut_cnt += cnt; + } + + // When all weights are the same nonzero value a palette size of 1 is possible; but not supported. + // Make the palette 2 entries long and zero the second entry (it's never indexed). + if ( i == 1 ) + { + p->lut[i++] = 0; + } + + // Heuristic for when to use the palette. If more than half of the + // weights are in the palette then we use it. This ensures we don't + // use palette for e.g. rectangular distributions. 
+ int palbits_val; + if ( !disable_lut && (lut_cnt >= all_cnt / 2) ) + { + p->palsize = i; + palbits_val = max_lut_val; + } + else + { + // No palette + p->palsize = 0; + // If no palette, then palbits is used to specify the + // number of bits required for uncompressed mode, i.e. + // the number of bits for the greatest weight value + palbits_val = all_max_val; + } + + // the palette entry bit width + // minimum 2-bits (because PALBITS is in range 2..9) + int palbits = 2; + while ( (1 << palbits) <= palbits_val ) + { + palbits++; + } + assert(palbits <= 9); + p->palbits = palbits; +} + +static void create_inverse_palette(palette_t *p) +{ + int i; + int val = p->palsize - p->direct_offset; + for ( i = 0; i < 256; i++ ) + { + p->inv_lut[256 + i] = int16_t(val); + p->inv_lut[256 - i] = int16_t(val + 1); + val += 2; + } + p->inv_lut[0] = 0; + + for ( i = 0; i < p->palsize; i++ ) + { + val = p->lut[i]; + int sign = val & 1; + int mag = val >> 1; + int weight = sign ? -mag : mag; + assert( ((weight + 256) >= 0) && ((weight + 256) < 512) ); + p->inv_lut[weight + 256] = int16_t(i); + } +} + +// If palette_size is 512, then palette is not used (in that case the palette is setup +// with the standard alternating unsigned to signed mapping) +static void update_palette(palette_t *p, const int16_t *weights, int weights_count, bool partial_data, bool disable_lut, bool disable_zruns) +{ + int(&freq)[512] = p->freq; + + int total_zeroes = 0; + int zeroes_in_run = 0; + int zeroes_in_all_runs = 0; + + // Calculate frequencies of the given weight stream + for ( int i = 0; i < weights_count; i++ ) + { + freq[weights[i] + 256]++; + if ( weights[i] == 0 ) + { + total_zeroes++; + zeroes_in_run++; + } + else + { + if ( zeroes_in_run >= MIN_ZERO_RUN_LENGTH ) + { + zeroes_in_all_runs += zeroes_in_run; + } + zeroes_in_run = 0; + } + } + + // Detect trailing zero runs in compression + if ( zeroes_in_run >= MIN_ZERO_RUN_LENGTH ) + { + zeroes_in_all_runs += zeroes_in_run; + } + + int 
common_val = 0; + int common_freq = 0; + for ( int i = 0; i < 512; i++ ) + { + // Most common non-zero frequency (because we already have that) + if ( (i != 256) && freq[i] > common_freq ) + { + common_val = i - 256; + common_freq = freq[i]; + } + } + + // Decide if zero-runs (alternating mode) should be used: + // * zero runs must make up at least half of the zeroes + // * zero should be the most common symbol + // * zero should be sufficiently more common than the second most common symbol + bool use_zero_runs = zeroes_in_all_runs >= (total_zeroes / 2); + use_zero_runs &= total_zeroes > (ZERO_FREQ_THRESHOLD * common_freq); + p->use_zero_runs = use_zero_runs && !disable_zruns; + // Create the palette + create_palette(p, partial_data, disable_lut); +} + +#define NWCFG 13 +#define NZCFG 4 // restrict search to ZDIV=0..3 +#define MAX_ZWCFG ((NWCFG > NZCFG) ? NWCFG : NZCFG) + +// (trunc<<4) | div, 0x20 means uncompressed +static constexpr char w_grc_params[] = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x20}; +static constexpr char z_grc_params[] = {0x00, 0x01, 0x02, 0x03}; + +struct grc_param_t +{ + int cfg = 0; + int end_pos = 0; +}; + +template +static int search_grc_params(const TYPE *inval_buf, int n_inval, int zrun_mode, int uncompressed_bits, + std::vector &result, bool single_slice) +{ + assert(uncompressed_bits < 32); + int n_cfg = zrun_mode ? NZCFG : NWCFG; + const char *grc_params = zrun_mode ? z_grc_params : w_grc_params; + + // Greedy forward-only GRC search (with optimisation for avoiding + // unusable GRC parameters). + const int cmd_cost = 40; + + int bit_count[MAX_ZWCFG] = {0}; + int reset_pos[MAX_ZWCFG] = {0}; + bool coded[MAX_ZWCFG] = {false}; + bool any_uncodable[MAX_ZWCFG] = {false}; + int active_bitcount = 0; + int active_cfg = -1; + int add_uncompressed_bits = (uncompressed_bits > 0) ? 
uncompressed_bits : 100; + + for ( int i = 0; i < n_inval; i++ ) + { + int value = inval_buf[i]; + + int best_bitcount = 0x7FFFFFFF; + int best_cfg = 0; + + // Loop over GRC parameters, calculate bits to code value, and then update the search state + for ( int j = 0; j < n_cfg; j++ ) + { + int div = grc_params[j] & 15; + int trunc = grc_params[j] >> 4; + int q = value >> div; + int bits = trunc ? std::min(q + 1, 2) + div : q + 1 + div; + + bool can_code = !(!zrun_mode && ((trunc && q > 2) || q > 31)); + if ( trunc == 2 ) + { + bits = add_uncompressed_bits; + can_code = true; + } + + if ( can_code ) + { + if ( !coded[j] ) + { + bit_count[j] = active_bitcount; // Reset non-coded to current best + } + bit_count[j] = bit_count[j] + bits; + + if ( bit_count[j] < best_bitcount ) + { + best_bitcount = bit_count[j]; + best_cfg = j; + } + } + else + { + reset_pos[j] = i + 1; + bit_count[j] += cmd_cost; // Would have to change away if used + } + + coded[j] = can_code; + any_uncodable[j] |= !can_code; + } + + // In single-slice mode we can check the bit counts afterwards; otherwise we record + // slice start points by tracking the minimum of the accumulting bit counts for + // different grc parameters. + if ( !single_slice ) + { + bool must_code = (active_cfg == -1) || !coded[active_cfg]; + if ( must_code || ((best_cfg != active_cfg) && (best_bitcount + cmd_cost) < active_bitcount) ) + { + // Commit non-initial changes + if ( active_cfg != -1 ) + { + // Range elision (was the other config better all along?) + if ( (bit_count[best_cfg] < bit_count[active_cfg]) && (reset_pos[best_cfg] <= reset_pos[active_cfg]) ) + { + // If the current BEST config started before the ACTIVE config in time, then switch to using the BEST config. + active_cfg = best_cfg; // Duplicated on both paths for clarity + } + else + { + // Otherwise use the ACTIVE config for this slice before switching to the BEST config. 
+ grc_param_t param; + param.cfg = active_cfg; + param.end_pos = i; + assert((active_cfg != 12) || (uncompressed_bits != 0)); + result.push_back(param); + } + } + active_cfg = best_cfg; + } + } + else if ( active_cfg == -1 ) + { + active_cfg = best_cfg; + } + + active_bitcount = bit_count[active_cfg]; + } + + // terminate the run + if ( result.empty() || (result.back().cfg != active_cfg) ) + { + // If single slice then select the best minimum-bits configuration + if (single_slice) + { + assert( result.empty() ); + active_cfg = -1; + int max_bit_count = std::numeric_limits::max(); + for (int i=0; i < n_cfg; i++) + { + if ( !any_uncodable[i] && (bit_count[i] <= max_bit_count) ) + { + if ( (active_cfg != 12) || (uncompressed_bits != 0) ) + { + active_cfg = i; + max_bit_count = bit_count[i]; + } + } + } + assert(active_cfg != -1); // There isn't a usable grc parameter (fatal) + } + + grc_param_t param; + param.cfg = active_cfg; + assert((active_cfg != 12) || (uncompressed_bits != 0)); + result.push_back(param); + } + result.back().end_pos = n_inval; + + return active_bitcount; +} + +#if !ENABLE_DEBUG_BITSTREAM + // Release build putbits macro + #define bitbuf_put(bb_, name_, len_, data_) bb_->put(len_, data_) + #define bitbuf_align(bb_, name_, len_, data_) bb_->align(len_, data_) +#else + // Debug build putbits macro + inline void bitbuf_put(bitbuf_t *bb, const char *name, int len, int data) + { + assert(len <= 32); + int pre_pos = bb->pos(); + bb->put(len, data); + BITSTREAM_LOG("%6d %s:%d = %d\n", pre_pos, name, bb->pos()-pre_pos, data); + } + + // Debug build putbits macro + inline void bitbuf_align(bitbuf_t *bb, const char *name, int len, int data) + { + assert(len <= 32); + int pre_pos = bb->pos(); + bb->align(len, data); + BITSTREAM_LOG("%6d %s:%d = %d\n", pre_pos, name, bb->pos()-pre_pos, data); + } +#endif + +static slice_params_t encode_slice_header(mle_context_t *ctx, int slicelen, bool new_palette, int uncompressed_bits, int w_cfg, int z_cfg, bitbuf_t *bb) 
+{ + assert( (ctx->slicelen_bits == 15) || (ctx->slicelen_bits == 17) ); // Currently known formats + assert( slicelen < (1 << ctx->slicelen_bits) ); + assert( w_cfg >= 0 && w_cfg < (sizeof(w_grc_params)/sizeof(w_grc_params[0]))); + + // GRC parameters for this slice + int w_grc_trunc = (w_grc_params[w_cfg] >> 4) == 1; + int w_uncompressed = (w_grc_params[w_cfg] >> 4) == 2; + // Callers can signal a truly empty slice with a negative z_cfg index + assert( ((z_cfg < 0) && (slicelen == 0)) || (ctx->allow_empty_slices && slicelen >= 0) || (slicelen >= 1) ); + int z_grc_div = (z_cfg < 0) ? ZDIV_DISABLE : z_grc_params[z_cfg] & 15; + int w_grc_div = w_uncompressed ? uncompressed_bits : (w_grc_params[w_cfg] & 15); + + int zdiv = ctx->palette.use_zero_runs ? z_grc_div : ZDIV_DISABLE; + int wdiv = !w_uncompressed ? w_grc_div : WDIV_UNCOMPRESSED; + + if ( ENABLE_DEBUG_BITSTREAM ) + { + BITSTREAM_LOG("slice: bitoffset %d slicelen %d zdiv %d wdiv %d wtrunc %d newpal %d palbits %d palsize %d\n", bb->pos(), + slicelen, zdiv, wdiv, w_grc_trunc, new_palette, ctx->palette.palbits, ctx->palette.palsize); + } + + // Write slice header + bitbuf_put(bb, "ZDIV", 3, zdiv); + bitbuf_put(bb, "SLICELEN", ctx->slicelen_bits, ctx->allow_empty_slices ? 
slicelen : slicelen - 1); + bitbuf_put(bb, "WDIV", 3, wdiv); + bitbuf_put(bb, "WTRUNC", 1, w_grc_trunc); + bitbuf_put(bb, "NEWPAL", 1, new_palette); + if ( new_palette ) + { + bitbuf_put(bb, "DIROFS", 5, ctx->palette.direct_offset); + bitbuf_put(bb, "PALSIZE", 5, std::max(0, ctx->palette.palsize - 1)); + bitbuf_put(bb, "PALBITS", 3, ctx->palette.palbits - 2); + for (int i = 0; i < ctx->palette.palsize; i++ ) + { + bitbuf_put(bb, "PALETTE", ctx->palette.palbits, ctx->palette.lut[i]); + } + } + + slice_params_t header; + header.w_grc_trunc = w_grc_trunc; + header.w_uncompressed = w_uncompressed; + header.z_grc_div = z_grc_div; + header.w_grc_div = w_grc_div; + return header; +} + + +static void encode_slice(mle_context_t *ctx, const int16_t *w_values, int weight_count, const int32_t *z_values, int zero_count, bool new_palette, + int uncompressed_bits, int w_cfg, int z_cfg, bitbuf_t *bb) +{ + int w_cnt = weight_count; + int z_cnt = (z_values && zero_count) ? w_cnt + (new_palette ? 1 : 0) : 0; + + slice_params_t hdr = encode_slice_header(ctx, w_cnt, new_palette, uncompressed_bits, w_cfg, z_cfg, bb); + + assert(z_cfg >= 0 && "slice was signalled as truly empty"); + + // Record slice parameters for HW testbench debugging + if ( ctx->enable_slice_debug ) + { + ctx->slice_debug.push_back( mle_slice_debug_t { hdr, ctx->palette } ); + } + + int w_grc_div = hdr.w_grc_div; + bool w_uncompressed = hdr.w_uncompressed; + int z_grc_div = hdr.z_grc_div; + int w_grc_trunc = hdr.w_grc_trunc; + + int j; + int z_unary_len = z_grc_div < 3 ? 
12 : 8; + int w_pos = 0, z_pos = 0; + int w_unary0 = 0, w_unary1 = 0, w_unary1_len = 0, w_q = -1, w_r = 0; + int z_unary = 0, z_q = -1, z_r = 0; + + int w_remain_data[2][12] = {{0}}; + int *w_remain = w_remain_data[0]; + int *w_prev_remain = w_remain_data[1]; + int w_nsymbols = 0; + int w_prev_enable = 0, w_prev_nsymbols = 0; + + int z_remain_data[2][12] = {{0}}; + int *z_remain = z_remain_data[0]; + int *z_prev_remain = z_remain_data[1]; + int z_nsymbols = 0; + int z_prev_enable = 0, z_prev_nsymbols = 0; + bool use_zero_runs = ctx->palette.use_zero_runs; + + do + { + int balance = use_zero_runs ? w_pos - z_pos : 0; + int w_enable = balance < 8 && w_pos < w_cnt; + int z_enable = balance >= 0 && use_zero_runs && z_pos < z_cnt; + if ( w_enable ) + { + // Encode chunk (weights) + j = 0; + w_nsymbols = 0; + w_unary0 = 0; + w_unary1 = 0; + w_unary1_len = 0; + int max_symbols = (w_uncompressed && w_grc_div > 5) ? 8 : 12; + while ( j < max_symbols ) + { + if ( w_q < 0 ) + { + if ( w_pos < w_cnt ) + { + int value = w_values[w_pos]; + assert( value >= 0 && value < 512 ); + w_q = value >> w_grc_div; + w_r = value & ((1 << w_grc_div) - 1); + assert(w_q <= 31 && (!w_grc_trunc || w_q <= 2)); + } + else + { + w_q = 0; + w_r = -1; // don't send remainder + } + } + while ( w_q >= 0 && j < max_symbols ) + { + w_unary0 |= w_q > 0 ? (1 << j) : 0; + if ( w_q > 0 ) + { + w_unary1 |= w_q > 1 ? 
(1 << w_unary1_len) : 0; + w_unary1_len++; + } + j++; + w_q -= 2; + if ( w_grc_trunc ) w_q--; + } + if ( w_q < 0 && w_r >= 0 ) + { + w_remain[w_nsymbols] = w_r; + w_nsymbols++; + w_pos++; + } + } + } + + if ( z_enable ) + { + // Encode chunk (zrun) + j = 0; + z_nsymbols = 0; + z_unary = 0; + while ( j < z_unary_len ) + { + if ( z_q < 0 ) + { + if ( z_pos < z_cnt ) + { + int value = z_values[z_pos]; + z_q = value >> z_grc_div; + z_r = value & ((1 << z_grc_div) - 1); + assert( z_q >= 0 ); // There are no negative length z-runs + } + else + { + z_q = 0; + z_r = -1; + } + } + while ( z_q >= 0 && j < z_unary_len ) + { + z_unary |= z_q > 0 ? (1 << j) : 0; + j++; + z_q--; + } + if ( z_q < 0 && z_r >= 0 ) + { + assert( z_nsymbols < 12 ); + z_remain[z_nsymbols] = z_r; + z_nsymbols++; + z_pos++; + } + } + } + + // Write chunk to bitstream + if ( w_enable && !w_uncompressed ) + { + bitbuf_put(bb, "WUNARY0", 12, w_unary0); // 12 bits + } + if ( z_enable ) + { + bitbuf_put(bb, "ZUNARY", z_unary_len, z_unary); // 12 or 8 bits + } + if ( w_enable && !w_uncompressed && (w_unary1_len > 0) ) + { + bitbuf_put(bb, "WUNARY1", w_unary1_len, w_unary1); // max 12 bits + } + if ( w_prev_enable ) + { + for (int i = 0; i < w_prev_nsymbols; i++ ) + { + bitbuf_put(bb, "WREMAIN", w_grc_div, w_prev_remain[i]); + } + } + if ( z_prev_enable && (z_grc_div > 0) ) + { + for (int i = 0; i < z_prev_nsymbols; i++ ) + { + bitbuf_put(bb, "ZREMAIN", z_grc_div, z_prev_remain[i]); + } + } + w_prev_enable = w_enable; + w_prev_nsymbols = w_nsymbols; + std::swap(w_prev_remain, w_remain); + z_prev_enable = z_enable; + z_prev_nsymbols = z_nsymbols; + std::swap(z_prev_remain, z_remain); + } while ( w_prev_enable || z_prev_enable ); +} + + +int ml_encode_section(mle_context_t *ctx, const int16_t *inbuf, int size, palette_t *p, bitbuf_t *bitbuf) +{ + bool new_palette = (p != nullptr); + + // Reuse previous if not specified + if ( p == nullptr ) + { + p = &ctx->palette; + } + + // Uncompressed mode can only be used 
if either all weights + // are in the palette OR if the palette is not used. + int uncompressed_bits = 0; + if ( p->only_palette ) + { + // Uncompressed bits derived from palette size + while ( (1 << uncompressed_bits) < p->palsize ) + { + uncompressed_bits++; + } + } + else if ( p->palsize == 0 ) + { + // Uncompressed bits is palbits (which is the bitdepth of the greatest weight) + uncompressed_bits = p->palbits; + } + + // If there are no weights at all, emit an empty slice header, then exit. + if ( size == 0 ) + { + // Signal a truly empty slice using -ve zgrc to ensure ZDIV_DISABLE is written to the stream. + if ( ctx->allow_empty_slices ) + { + encode_slice_header(ctx, 0, new_palette, uncompressed_bits, 0, -1, bitbuf); + } + return 0; + } + + std::vector weight_values; + weight_values.reserve(size); + + // If zruns was enabled, expect total to be < weight_values/2 + std::vector zrun_values; + if ( p->use_zero_runs ) + { + zrun_values.reserve( size / 4); + } + + // Get weights (or weight indicies) AND zero-runs from the input weight stream. + int i = 0; + bool allow_empty_slices = !p->only_zeros || ctx->allow_empty_slices; + int total_zcnt = 0; + const int max_slice_len = (1 << ctx->slicelen_bits) - 1; + while ( 1 ) + { + if ( p->use_zero_runs ) + { + int zcnt = 0; + // Count zero run + // Special case: if all weights in the section are zero, we must + // still ensure we have one coded weight so the the slice length + // doesn't become 0. 
Therefore we skip the first zero run and code + // the zero explicitly as a weight value instead + if ( allow_empty_slices || i > 0 ) + { + while ( i < size && inbuf[i] == 0 && zcnt < max_slice_len ) + { + zcnt++; + i++; + } + } + total_zcnt += zcnt; + zrun_values.push_back(zcnt); + } + if ( i == size ) break; + int16_t value = p->inv_lut[inbuf[i] + 256]; + weight_values.push_back(value); + i++; + } + + // Search for good GRC parameters for the weight stream + std::vector w_slice_cfg; + int n_weights = int(weight_values.size()); + if ( n_weights ) + { + // Use a fixed grc config index if provided (partial-data mode sets this) + if ( ctx->fixed_wgrc >= 0 ) + { + w_slice_cfg.push_back(grc_param_t{ ctx->fixed_wgrc, n_weights }); + } + else + { + search_grc_params(weight_values.data(), n_weights, 0, uncompressed_bits, w_slice_cfg, ctx->single_slice_sections); + } + } + int n_w_slice = int(w_slice_cfg.size()); + + // Search for good GRC parameters for the zrun stream + std::vector z_slice_cfg; + if ( p->use_zero_runs ) + { + // Use a fixed grc config index if provided (partial-data mode sets this) + if ( ctx->fixed_zgrc >= 0 ) + { + z_slice_cfg.push_back(grc_param_t{ ctx->fixed_zgrc, n_weights + 1 }); + } + else + { + search_grc_params(zrun_values.data(), n_weights + 1, 1, 0, z_slice_cfg, ctx->single_slice_sections); + } + } + int n_z_slice = int(z_slice_cfg.size()); + + int loops = 0; + + // Encode bitstream slice + int pos = 0, i_w_slice = 0, i_z_slice = 0; + bool only_zero_runs_pass = !zrun_values.empty(); + while ( (pos < n_weights) || new_palette || only_zero_runs_pass ) + { + int w_len = 0; + int z_len = 0; + + if ( i_w_slice < n_w_slice ) + { + w_len = w_slice_cfg[i_w_slice].end_pos - pos; + w_len = std::min(w_len, max_slice_len); + } + + if ( i_z_slice < n_z_slice ) + { + z_len = z_slice_cfg[i_z_slice].end_pos - pos; + z_len = std::min(z_len, max_slice_len); + } + + // The first slice (when new_palette is 1) encodes zero runs both at the + // beginning and end 
(i.e. number of zero runs are len+1). + // The following slices only encode zero runs at the end (there cannot be + // any zeros in the beginning since they are encoded by the previous slice) + const int32_t *zrun_buf = p->use_zero_runs ? zrun_values.data() + pos + !(new_palette || ctx->allow_empty_slices) : nullptr; + const int16_t *w_buf = w_len ? weight_values.data() + pos : nullptr; + int w_cfg = w_len ? w_slice_cfg[i_w_slice].cfg : 0; + int z_cfg = p->use_zero_runs ? z_slice_cfg[i_z_slice].cfg : 0; + + encode_slice(ctx, w_buf, w_len, zrun_buf, z_len, new_palette, uncompressed_bits, w_cfg, z_cfg, bitbuf); + new_palette = 0; + + if ( z_len <= 0 && w_len > 0 ) + pos += w_len; + else if ( w_len <= 0 && z_len > 0 ) + pos += z_len; + else + pos += std::min(z_len, w_len); + + if ( i_w_slice < n_w_slice && w_slice_cfg[i_w_slice].end_pos <= pos ) + { + i_w_slice++; + } + if ( i_z_slice < n_z_slice && z_slice_cfg[i_z_slice].end_pos <= pos ) + { + i_z_slice++; + } + loops++; + only_zero_runs_pass = false; + } + // Single-slice sections can only generate one slice (a single loop) + assert( !ctx->single_slice_sections || (ctx->single_slice_sections && loops == 1) ); + if ( ctx->single_slice_sections && (loops != 1) ) + { + return -1; + } + return total_zcnt; +} + + +palette_t *ml_encode_palette(mle_context_t *ctx, const int16_t *weights, int encode_count, int analyse_count, unsigned mlw_encode_flags) +{ + palette_t *palette = nullptr; + if ( !ctx->palette_valid || (mlw_encode_flags & MLW_ENCODE_INSERT_PALETTE) ) + { + if (mlw_encode_flags & MLW_ENCODE_RESET_PALETTE) + { + memset( ctx->palette.freq, 0, sizeof(ctx->palette.freq) ); + } + + bool partial_data = (mlw_encode_flags & MLW_ENCODE_PARTIAL_DATA) != 0; + bool disable_lut = (mlw_encode_flags & MLW_ENCODE_NO_PALETTE_LUT) != 0; + bool disable_zruns = (mlw_encode_flags & MLW_ENCODE_NO_ZERO_RUNS) != 0; + + assert( analyse_count >= encode_count && "Must analyse at least as much as is encoded"); + + 
update_palette(&ctx->palette, weights, analyse_count, partial_data, disable_lut, disable_zruns); + ctx->palette_valid = true; + if ( !(mlw_encode_flags & MLW_ENCODE_DPIC_FORCE_PARAMS) ) + { + ctx->fixed_wgrc = (partial_data) ? 5 : -1; + ctx->fixed_zgrc = (partial_data) ? 3 : -1; + } + + create_inverse_palette(&ctx->palette); + palette = &ctx->palette; + } + return palette; +} + +void ml_encode_eos(mle_context_t *ctx, bitbuf_t &bits, unsigned mlw_encode_flags) +{ + // Add end of stream marker and align to 128bit + bitbuf_t *bb = &bits; + if ( ctx->eos_required ) + { + bitbuf_put(bb, "ZDIV", 3, ZDIV_EOS); + } + bitbuf_align(bb, "BYTEALIGN", 8, 0xff); + + if ( !(mlw_encode_flags & MLW_ENCODE_NO_PADDING) ) + { + bb->align( 128, 0xFF ); + } + bb->flush(); +} + +int ml_encode_internal(mle_context_t *ctx, bitbuf_t &bits, const int16_t *weights, int encode_count, int analyse_count, unsigned mlw_encode_flags) +{ + palette_t *palette = ml_encode_palette(ctx, weights, encode_count, analyse_count, mlw_encode_flags); + + int zresult = ml_encode_section(ctx, weights, encode_count, palette, &bits); + if ( zresult < 0 ) + { + return -1; + } + ctx->zero_count += zresult; + return 0; +} + +extern "C" +{ + +ML_ENCODER_DLL_EXPORT mle_context_t *mle_create_context(int32_t syntax) +{ + mle_context_t *ctx = new mle_context_t; + ctx->zero_count = 0; + ctx->syntax = syntax; + ctx->realloc_func = nullptr; + if (syntax == MLW_ENCODE_SYNTAX_ETHOSU) + { + ctx->slicelen_bits = ETHOSU_SLICELEN_BITS; + ctx->allow_empty_slices = false; + ctx->single_slice_sections = false; + ctx->eos_required = true; + } + else if (syntax == MLW_ENCODE_SYNTAX_ETHOSU_FWD) + { + ctx->slicelen_bits = 0; + } + else + { + assert(false && "bad syntax"); + delete ctx; + return nullptr; + } + return ctx; +} + +ML_ENCODER_DLL_EXPORT int mle_context_query_zeroes(mle_context_t *ctx) +{ + assert( ctx ); + return ctx->zero_count; +} + +ML_ENCODER_DLL_EXPORT void mle_context_set_allocator(mle_context_t *ctx, void* 
(*realloc_func)(void*, size_t, int)) +{ + assert( ctx ); + ctx->realloc_func = realloc_func; +} + +ML_ENCODER_DLL_EXPORT void mle_destroy_context(mle_context_t *ctx) +{ + assert(ctx); + delete ctx; +} + +ML_ENCODER_DLL_EXPORT int mle_encode(mle_context_t *ctx, ml_encode_result_t *result, const int16_t *inbuf, int inbuf_size, unsigned mlw_encode_flags) +{ + assert( ctx && result ); + raw_buffer_t output(4096, MLW_ENCODE_ALLOC_STREAM0, ctx->realloc_func); + bitbuf_t bits(output, 4096, mlw_encode_flags & MLW_ENCODE_NO_BITSTREAM); + int written = 0; + + if ( ctx->syntax == MLW_ENCODE_SYNTAX_ETHOSU_FWD ) + { + written = ml_encode_fwd(ctx, bits, inbuf, inbuf_size, mlw_encode_flags); + } + else + { + int start = bits.byte_pos(); + if ( ml_encode_internal(ctx, bits, inbuf, inbuf_size, inbuf_size, mlw_encode_flags) < 0 ) + { + return -1; + } + ml_encode_eos(ctx, bits, mlw_encode_flags); + written = bits.byte_pos() - start; + } + + if ( written >= 0 ) + { + result->encoded_data = output.detach(); + result->encoded_length = written; + result->section_info = nullptr; + result->section_count = 0; + } + return written; +} + +ML_ENCODER_DLL_EXPORT void mle_free(ml_encode_result_t *result) +{ + if ( result ) + { + if ( result->encoded_data ) + { + free( result->encoded_data ); + result->encoded_data = nullptr; + } + if ( result->section_info ) + { + free( result->section_info ); + result->section_info = nullptr; + } + } +} + +} diff --git a/ethosu/regor/dependencies/mlw_codec/source/mlw_encode_fwd.cpp b/ethosu/regor/dependencies/mlw_codec/source/mlw_encode_fwd.cpp new file mode 100644 index 00000000..c0bcc101 --- /dev/null +++ b/ethosu/regor/dependencies/mlw_codec/source/mlw_encode_fwd.cpp @@ -0,0 +1,197 @@ +// +// SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. 
// You may obtain a copy of the License at
//
// www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an AS IS BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
//
// Fast-weight (FWD) encoder for the mlw_codec: palettised (LUT2/LUT4) or
// raw 8-bit encoding of zero-corrected weight values, preceded by a
// fixed-size 256-bit stream header.

#include "../include/mlw_encode.h"

#include "ml_encoder_internal.hpp"
#include "ml_raw_buffer.hpp"
#include "ml_bit_buffer.hpp"

// NOTE(review): the angle-bracket header names below appear to have been
// stripped in this view of the patch (likely <algorithm>, <cassert>,
// <cstdint>, <cstdlib>, ...) — confirm against the original file.
#include
#include
#include
#include
#include
#include

// Maximum number of palette (LUT) entries supported by the format.
constexpr static int LUT_MAX = 16;
// Size of the inverse-index table: covers all values foldable by fold()
// into a non-negative index (sign bit in the lsb).
constexpr static int INV_MAX = 512;

// Stream header state built during analysis and emitted by fwd_emit_header().
struct fwd_header_t
{
    bool raw_mode_flag = false;   // true: weights stored as raw 8-bit values (no palette)
    bool small_lut_flag = false;  // true: 2-bit palette indices (<= 4 LUT entries used)
    int8_t zero_adjust = 0;       // offset added to raw values to fit them into 8 bits
    int16_t lut[LUT_MAX] = {};    // palette of weight values
    int8_t inv_index[INV_MAX] = {};  // folded-value -> LUT slot; -1 means not yet indexed
    fwd_header_t()
    {
        // All values start out un-indexed.
        std::fill_n(inv_index, INV_MAX, -1);
    }
};


// Map a signed value to a non-negative index with the sign in the lsb,
// so that small-magnitude values get small indices.
static inline int32_t fold(int32_t value)
{
    // Fold into positive value (sign in lsb)
    return (abs(value) << 1) | (uint32_t(value) >> 31);
}


// Emit the fixed-size stream header: 1+1 flag bits, 102 reserved zero bits,
// 8-bit zero adjust, then 16 folded 9-bit LUT entries — 256 bits in total
// (matches the space reserved by ml_encode_fwd before analysis).
void fwd_emit_header(bitbuf_t &bits, const fwd_header_t &hdr)
{
    bits.put( 1, hdr.raw_mode_flag ? 1 : 0 );
    bits.put( 1, hdr.small_lut_flag ? 1 : 0 );
    bits.fill( 102, 0 );
    bits.put_masked( 8, hdr.zero_adjust );
    for (int i = 0; i < LUT_MAX; i++)
    {
        bits.put( 9, fold(hdr.lut[i]) );
    }
}


// Analyse the weight stream: compute its value range, build the palette
// (LUT) and inverse index in hdr, and decide raw vs LUT2/LUT4 mode.
// While the palette stays small (<= 4 entries) the 2-bit indices are
// speculatively written straight into the output stream; if analysis later
// selects LUT4 or raw mode, ml_encode_fwd rewinds and re-encodes instead.
// Returns false if the stream cannot be encoded in this format.
bool fwd_analyse(fwd_header_t &hdr, const int16_t *weights, int count, bitbuf_t &bits)
{
    int range_min = 1000;
    int range_max = -1000;
    int8_t *inv = hdr.inv_index;
    int16_t *lut = hdr.lut;
    int lut_used = 0;
    bool use_lut = true;

    // Must check all the zero-point-correct values for full range
    for (int i = 0; i < count; i++)
    {
        int value = weights[i];
        range_min = std::min( range_min, value );
        range_max = std::max( range_max, value );

        // Update the LUT only while it's still viable (predicts well).
        if ( use_lut )
        {
            // Map the signed value to the LUT via +ve indexed table
            int idx = fold(value);
            assert( idx < INV_MAX );

            // Check if value has already been indexed before adding a
            // new lut entry.
            if ( inv[idx] < 0 )
            {
                if ( lut_used < LUT_MAX )
                {
                    inv[idx] = lut_used;
                    lut[lut_used] = value;
                    lut_used++;
                }
                else
                {
                    use_lut = false;  // LUT was full and is now unusable
                }
            }
            // While lut2 is valid, encode the entries. When we're
            // done the bitstream will be ready.
            if (lut_used <= 4)
            {
                bits.put(2, inv[idx]);
            }
        }
    }

    hdr.raw_mode_flag = !use_lut;
    hdr.small_lut_flag = (lut_used <= 4);
    hdr.zero_adjust = 0;

    // If raw mode, calculate the zero point
    if ( hdr.raw_mode_flag )
    {
        // Raw mode stores 8-bit values, so the value span must fit in 256
        // and out-of-range streams are shifted into [-128, 127].
        int full_range = (range_max - range_min);
        if (full_range >= 256)
        {
            return false;  // Can't encode this stream
        }
        else if ( range_min < -128 )
        {
            hdr.zero_adjust = -128 - range_min;  // Raw values need offsetting +ve by this amount
        }
        else if ( range_max > 127 )
        {
            hdr.zero_adjust = 127 - range_max;  // Raw values need offsetting -ve by this amount
        }
    }

    // Overall encodable value range for this format is [-256, 255].
    return (range_min >= -256) && (range_max < 256);
}

// Encode zero-corrected weight values in the optimal fast-weight format.
+int ml_encode_fwd(mle_context_t *ctx, bitbuf_t &bits, const int16_t *weights, int count, unsigned mlw_encode_flags) +{ + fwd_header_t header; + int pos = bits.pos(); + bits.fill(256, 0); // Reserve space for header + + // Encode lut2 weights directly to the main stream while analysing + if ( !fwd_analyse(header, weights, count, bits) ) + { + return -1; // Encoding error + } + + // Check for forced no palette + if ( mlw_encode_flags & MLW_ENCODE_NO_PALETTE_LUT ) + { + header.raw_mode_flag = 1; + header.small_lut_flag = 0; + } + + // Use a substream of the main stream for the header + bitbuf_t hdr_bits(bits, pos); + fwd_emit_header(hdr_bits, header); + bits.sync(hdr_bits); + + // LUT2 + if ( header.small_lut_flag ) + { + assert( !header.raw_mode_flag ); + } + // RAW + else if ( header.raw_mode_flag ) + { + bits.reposition(pos + 256); + for (int i=0; i < count; i++) + { + int value = (weights[i] + header.zero_adjust) & 0xFF; + bits.put(8, value); + } + } + // LUT4 + else + { + bits.reposition(pos + 256); + for (int i=0; i < count; i++) + { + int idx = fold(weights[i]); + bits.put(4, header.inv_index[idx]); + } + } + + bits.align(256, 0); + bits.flush(); + + int written = bits.pos() / 8; + return written; +} diff --git a/ethosu/regor/dependencies/thirdparty/Catch2/BUILD.bazel b/ethosu/regor/dependencies/thirdparty/Catch2/BUILD.bazel new file mode 100644 index 00000000..c51bf57e --- /dev/null +++ b/ethosu/regor/dependencies/thirdparty/Catch2/BUILD.bazel @@ -0,0 +1,95 @@ +load("@bazel_skylib//rules:expand_template.bzl", "expand_template") + +expand_template( + name = "catch_user_config", + out = "catch2/catch_user_config.hpp", + substitutions = { + "@CATCH_CONFIG_CONSOLE_WIDTH@": "80", + "@CATCH_CONFIG_DEFAULT_REPORTER@": "console", + "#cmakedefine CATCH_CONFIG_ANDROID_LOGWRITE": "", + "#cmakedefine CATCH_CONFIG_BAZEL_SUPPORT": "#define CATCH_CONFIG_BAZEL_SUPPORT", + "#cmakedefine CATCH_CONFIG_COLOUR_WIN32": "", + "#cmakedefine CATCH_CONFIG_COUNTER": "", + "#cmakedefine 
CATCH_CONFIG_CPP11_TO_STRING": "", + "#cmakedefine CATCH_CONFIG_CPP17_BYTE": "", + "#cmakedefine CATCH_CONFIG_CPP17_OPTIONAL": "", + "#cmakedefine CATCH_CONFIG_CPP17_STRING_VIEW": "", + "#cmakedefine CATCH_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS": "", + "#cmakedefine CATCH_CONFIG_CPP17_VARIANT": "", + "#cmakedefine CATCH_CONFIG_DISABLE_EXCEPTIONS_CUSTOM_HANDLER": "", + "#cmakedefine CATCH_CONFIG_DISABLE_EXCEPTIONS": "", + "#cmakedefine CATCH_CONFIG_DISABLE_STRINGIFICATION": "", + "#cmakedefine CATCH_CONFIG_DISABLE": "", + "#cmakedefine CATCH_CONFIG_ENABLE_ALL_STRINGMAKERS": "", + "#cmakedefine CATCH_CONFIG_ENABLE_OPTIONAL_STRINGMAKER": "", + "#cmakedefine CATCH_CONFIG_ENABLE_PAIR_STRINGMAKER": "", + "#cmakedefine CATCH_CONFIG_ENABLE_TUPLE_STRINGMAKER": "", + "#cmakedefine CATCH_CONFIG_ENABLE_VARIANT_STRINGMAKER": "", + "#cmakedefine CATCH_CONFIG_EXPERIMENTAL_REDIRECT": "", + "#cmakedefine CATCH_CONFIG_FALLBACK_STRINGIFIER @CATCH_CONFIG_FALLBACK_STRINGIFIER@": "", + "#cmakedefine CATCH_CONFIG_FAST_COMPILE": "", + "#cmakedefine CATCH_CONFIG_GETENV": "", + "#cmakedefine CATCH_CONFIG_GLOBAL_NEXTAFTER": "", + "#cmakedefine CATCH_CONFIG_NO_ANDROID_LOGWRITE": "", + "#cmakedefine CATCH_CONFIG_NO_COLOUR_WIN32": "", + "#cmakedefine CATCH_CONFIG_NO_COUNTER": "", + "#cmakedefine CATCH_CONFIG_NO_CPP11_TO_STRING": "", + "#cmakedefine CATCH_CONFIG_NO_CPP17_BYTE": "", + "#cmakedefine CATCH_CONFIG_NO_CPP17_OPTIONAL": "", + "#cmakedefine CATCH_CONFIG_NO_CPP17_STRING_VIEW": "", + "#cmakedefine CATCH_CONFIG_NO_CPP17_UNCAUGHT_EXCEPTIONS": "", + "#cmakedefine CATCH_CONFIG_NO_CPP17_VARIANT": "", + "#cmakedefine CATCH_CONFIG_NO_GETENV": "", + "#cmakedefine CATCH_CONFIG_NO_GLOBAL_NEXTAFTER": "", + "#cmakedefine CATCH_CONFIG_NO_POSIX_SIGNALS": "", + "#cmakedefine CATCH_CONFIG_NO_USE_ASYNC": "", + "#cmakedefine CATCH_CONFIG_NO_EXPERIMENTAL_STATIC_ANALYSIS_SUPPORT": "", + "#cmakedefine CATCH_CONFIG_NO_WCHAR": "", + "#cmakedefine CATCH_CONFIG_NO_WINDOWS_SEH": "", + "#cmakedefine 
CATCH_CONFIG_NOSTDOUT": "", + "#cmakedefine CATCH_CONFIG_POSIX_SIGNALS": "", + "#cmakedefine CATCH_CONFIG_PREFIX_ALL": "", + "#cmakedefine CATCH_CONFIG_PREFIX_MESSAGES": "", + "#cmakedefine CATCH_CONFIG_SHARED_LIBRARY": "", + "#cmakedefine CATCH_CONFIG_EXPERIMENTAL_STATIC_ANALYSIS_SUPPORT": "", + "#cmakedefine CATCH_CONFIG_USE_ASYNC": "", + "#cmakedefine CATCH_CONFIG_WCHAR": "", + "#cmakedefine CATCH_CONFIG_WINDOWS_CRTDBG": "", + "#cmakedefine CATCH_CONFIG_WINDOWS_SEH": "", + }, + template = "src/catch2/catch_user_config.hpp.in", +) + +# Generated header library, modifies the include prefix to account for +# generation path so that we can include +# correctly. +cc_library( + name = "catch2_generated", + hdrs = ["catch2/catch_user_config.hpp"], + include_prefix = ".", # to manipulate -I of dependenices + visibility = ["//visibility:public"], +) + +# Static library, without main. +cc_library( + name = "catch2", + srcs = glob( + ["src/catch2/**/*.cpp"], + exclude = ["src/catch2/internal/catch_main.cpp"], + ), + hdrs = glob(["src/catch2/**/*.hpp"]), + includes = ["src/"], + linkstatic = True, + visibility = ["//visibility:public"], + deps = [":catch2_generated"], +) + +# Static library, with main. 
+cc_library( + name = "catch2_main", + srcs = ["src/catch2/internal/catch_main.cpp"], + includes = ["src/"], + linkstatic = True, + visibility = ["//visibility:public"], + deps = [":catch2"], +) diff --git a/ethosu/regor/dependencies/thirdparty/Catch2/CMake/Catch2Config.cmake.in b/ethosu/regor/dependencies/thirdparty/Catch2/CMake/Catch2Config.cmake.in new file mode 100644 index 00000000..c485219c --- /dev/null +++ b/ethosu/regor/dependencies/thirdparty/Catch2/CMake/Catch2Config.cmake.in @@ -0,0 +1,10 @@ +@PACKAGE_INIT@ + + +# Avoid repeatedly including the targets +if(NOT TARGET Catch2::Catch2) + # Provide path for scripts + list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}") + + include(${CMAKE_CURRENT_LIST_DIR}/Catch2Targets.cmake) +endif() diff --git a/ethosu/regor/dependencies/thirdparty/Catch2/CMake/CatchConfigOptions.cmake b/ethosu/regor/dependencies/thirdparty/Catch2/CMake/CatchConfigOptions.cmake new file mode 100644 index 00000000..6eae220d --- /dev/null +++ b/ethosu/regor/dependencies/thirdparty/Catch2/CMake/CatchConfigOptions.cmake @@ -0,0 +1,89 @@ + +# Copyright Catch2 Authors +# Distributed under the Boost Software License, Version 1.0. +# (See accompanying file LICENSE.txt or copy at +# https://www.boost.org/LICENSE_1_0.txt) + +# SPDX-License-Identifier: BSL-1.0 + +## +# This file contains options that are materialized into the Catch2 +# compiled library. All of them default to OFF, as even the positive +# forms correspond to the user _forcing_ them to ON, while being OFF +# means that Catch2 can use its own autodetection. 
+# +# For detailed docs look into docs/configuration.md + + +macro(AddOverridableConfigOption OptionBaseName) + option(CATCH_CONFIG_${OptionBaseName} "Read docs/configuration.md for details" OFF) + option(CATCH_CONFIG_NO_${OptionBaseName} "Read docs/configuration.md for details" OFF) + mark_as_advanced(CATCH_CONFIG_${OptionBaseName} CATCH_CONFIG_NO_${OptionBaseName}) +endmacro() + +macro(AddConfigOption OptionBaseName) + option(CATCH_CONFIG_${OptionBaseName} "Read docs/configuration.md for details" OFF) + mark_as_advanced(CATCH_CONFIG_${OptionBaseName}) +endmacro() + +set(_OverridableOptions + "ANDROID_LOGWRITE" + "BAZEL_SUPPORT" + "COLOUR_WIN32" + "COUNTER" + "CPP11_TO_STRING" + "CPP17_BYTE" + "CPP17_OPTIONAL" + "CPP17_STRING_VIEW" + "CPP17_UNCAUGHT_EXCEPTIONS" + "CPP17_VARIANT" + "GLOBAL_NEXTAFTER" + "POSIX_SIGNALS" + "USE_ASYNC" + "WCHAR" + "WINDOWS_SEH" + "GETENV" + "EXPERIMENTAL_STATIC_ANALYSIS_SUPPORT" +) + +foreach(OptionName ${_OverridableOptions}) + AddOverridableConfigOption(${OptionName}) +endforeach() + +set(_OtherConfigOptions + "DISABLE_EXCEPTIONS" + "DISABLE_EXCEPTIONS_CUSTOM_HANDLER" + "DISABLE" + "DISABLE_STRINGIFICATION" + "ENABLE_ALL_STRINGMAKERS" + "ENABLE_OPTIONAL_STRINGMAKER" + "ENABLE_PAIR_STRINGMAKER" + "ENABLE_TUPLE_STRINGMAKER" + "ENABLE_VARIANT_STRINGMAKER" + "EXPERIMENTAL_REDIRECT" + "FAST_COMPILE" + "NOSTDOUT" + "PREFIX_ALL" + "PREFIX_MESSAGES" + "WINDOWS_CRTDBG" +) + + +foreach(OptionName ${_OtherConfigOptions}) + AddConfigOption(${OptionName}) +endforeach() +if(DEFINED BUILD_SHARED_LIBS) + set(CATCH_CONFIG_SHARED_LIBRARY ${BUILD_SHARED_LIBS}) +else() + set(CATCH_CONFIG_SHARED_LIBRARY "") +endif() + +set(CATCH_CONFIG_DEFAULT_REPORTER "console" CACHE STRING "Read docs/configuration.md for details. The name of the reporter should be without quotes.") +set(CATCH_CONFIG_CONSOLE_WIDTH "80" CACHE STRING "Read docs/configuration.md for details. 
Must form a valid integer literal.") + +mark_as_advanced(CATCH_CONFIG_SHARED_LIBRARY CATCH_CONFIG_DEFAULT_REPORTER CATCH_CONFIG_CONSOLE_WIDTH) + +# There is no good way to both turn this into a CMake cache variable, +# and keep reasonable default semantics inside the project. Thus we do +# not define it and users have to provide it as an outside variable. +#set(CATCH_CONFIG_FALLBACK_STRINGIFIER "" CACHE STRING "Read docs/configuration.md for details.") diff --git a/ethosu/regor/dependencies/thirdparty/Catch2/CMake/CatchMiscFunctions.cmake b/ethosu/regor/dependencies/thirdparty/Catch2/CMake/CatchMiscFunctions.cmake new file mode 100644 index 00000000..84bd7cc7 --- /dev/null +++ b/ethosu/regor/dependencies/thirdparty/Catch2/CMake/CatchMiscFunctions.cmake @@ -0,0 +1,121 @@ + +# Copyright Catch2 Authors +# Distributed under the Boost Software License, Version 1.0. +# (See accompanying file LICENSE.txt or copy at +# https://www.boost.org/LICENSE_1_0.txt) + +# SPDX-License-Identifier: BSL-1.0 + +include(CheckCXXCompilerFlag) +function(add_cxx_flag_if_supported_to_targets flagname targets) + string(MAKE_C_IDENTIFIER ${flagname} flag_identifier ) + check_cxx_compiler_flag("${flagname}" HAVE_FLAG_${flag_identifier}) + + if (HAVE_FLAG_${flag_identifier}) + foreach(target ${targets}) + target_compile_options(${target} PRIVATE ${flagname}) + endforeach() + endif() +endfunction() + +# Assumes that it is only called for development builds, where warnings +# and Werror is desired, so it also enables Werror. +function(add_warnings_to_targets targets) + LIST(LENGTH targets TARGETS_LEN) + # For now we just assume 2 possibilities: msvc and msvc-like compilers, + # and other. 
+ if (MSVC) + foreach(target ${targets}) + # Force MSVC to consider everything as encoded in utf-8 + target_compile_options( ${target} PRIVATE /utf-8 ) + # Enable Werror equivalent + if (CATCH_ENABLE_WERROR) + target_compile_options( ${target} PRIVATE /WX ) + endif() + + # MSVC is currently handled specially + if ( CMAKE_CXX_COMPILER_ID MATCHES "MSVC" ) + STRING(REGEX REPLACE "/W[0-9]" "/W4" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) # override default warning level + target_compile_options( ${target} PRIVATE /w44265 /w44061 /w44062 /w45038 ) + endif() + endforeach() + + endif() + + if (NOT MSVC) + set(CHECKED_WARNING_FLAGS + "-Wabsolute-value" + "-Wall" + "-Wcall-to-pure-virtual-from-ctor-dtor" + "-Wcast-align" + "-Wcatch-value" + "-Wdangling" + "-Wdeprecated" + "-Wdeprecated-register" + "-Wexceptions" + "-Wexit-time-destructors" + "-Wextra" + "-Wextra-semi" + "-Wfloat-equal" + "-Wglobal-constructors" + "-Winit-self" + "-Wmisleading-indentation" + "-Wmismatched-new-delete" + "-Wmismatched-return-types" + "-Wmismatched-tags" + "-Wmissing-braces" + "-Wmissing-declarations" + "-Wmissing-noreturn" + "-Wmissing-prototypes" + "-Wmissing-variable-declarations" + "-Wnull-dereference" + "-Wold-style-cast" + "-Woverloaded-virtual" + "-Wparentheses" + "-Wpedantic" + "-Wredundant-decls" + "-Wreorder" + "-Wreturn-std-move" + "-Wshadow" + "-Wstrict-aliasing" + "-Wsubobject-linkage" + "-Wsuggest-destructor-override" + "-Wsuggest-override" + "-Wundef" + "-Wuninitialized" + "-Wunneeded-internal-declaration" + "-Wunreachable-code-aggressive" + "-Wunused" + "-Wunused-function" + "-Wunused-parameter" + "-Wvla" + "-Wweak-vtables" + + # This is a useful warning, but our tests sometimes rely on + # functions being present, but not picked (e.g. various checks + # for stringification implementation ordering). + # Ergo, we should use it every now and then, but we cannot + # enable it by default. 
+ # "-Wunused-member-function" + ) + foreach(warning ${CHECKED_WARNING_FLAGS}) + add_cxx_flag_if_supported_to_targets(${warning} "${targets}") + endforeach() + + if (CATCH_ENABLE_WERROR) + foreach(target ${targets}) + # Enable Werror equivalent + target_compile_options( ${target} PRIVATE -Werror ) + endforeach() + endif() + endif() +endfunction() + +# Adds flags required for reproducible build to the target +# Currently only supports GCC and Clang +function(add_build_reproducibility_settings target) + # Make the build reproducible on versions of g++ and clang that supports -ffile-prefix-map + if((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang")) + add_cxx_flag_if_supported_to_targets("-ffile-prefix-map=${CATCH_DIR}/=" "${target}") + endif() +endfunction() diff --git a/ethosu/regor/dependencies/thirdparty/Catch2/CMake/FindGcov.cmake b/ethosu/regor/dependencies/thirdparty/Catch2/CMake/FindGcov.cmake new file mode 100644 index 00000000..41417113 --- /dev/null +++ b/ethosu/regor/dependencies/thirdparty/Catch2/CMake/FindGcov.cmake @@ -0,0 +1,157 @@ +# This file is part of CMake-codecov. +# +# Copyright (c) +# 2015-2017 RWTH Aachen University, Federal Republic of Germany +# +# See the LICENSE file in the package base directory for details +# +# Written by Alexander Haase, alexander.haase@rwth-aachen.de +# + + +# include required Modules +include(FindPackageHandleStandardArgs) + + +# Search for gcov binary. +set(CMAKE_REQUIRED_QUIET_SAVE ${CMAKE_REQUIRED_QUIET}) +set(CMAKE_REQUIRED_QUIET ${codecov_FIND_QUIETLY}) + +get_property(ENABLED_LANGUAGES GLOBAL PROPERTY ENABLED_LANGUAGES) +foreach (LANG ${ENABLED_LANGUAGES}) + # Gcov evaluation is dependent on the used compiler. Check gcov support for + # each compiler that is used. If gcov binary was already found for this + # compiler, do not try to find it again. 
+ if (NOT GCOV_${CMAKE_${LANG}_COMPILER_ID}_BIN) + get_filename_component(COMPILER_PATH "${CMAKE_${LANG}_COMPILER}" PATH) + + if ("${CMAKE_${LANG}_COMPILER_ID}" STREQUAL "GNU") + # Some distributions like OSX (homebrew) ship gcov with the compiler + # version appended as gcov-x. To find this binary we'll build the + # suggested binary name with the compiler version. + string(REGEX MATCH "^[0-9]+" GCC_VERSION + "${CMAKE_${LANG}_COMPILER_VERSION}") + + find_program(GCOV_BIN NAMES gcov-${GCC_VERSION} gcov + HINTS ${COMPILER_PATH}) + + elseif ("${CMAKE_${LANG}_COMPILER_ID}" STREQUAL "Clang") + # Some distributions like Debian ship llvm-cov with the compiler + # version appended as llvm-cov-x.y. To find this binary we'll build + # the suggested binary name with the compiler version. + string(REGEX MATCH "^[0-9]+.[0-9]+" LLVM_VERSION + "${CMAKE_${LANG}_COMPILER_VERSION}") + + # llvm-cov prior version 3.5 seems to be not working with coverage + # evaluation tools, but these versions are compatible with the gcc + # gcov tool. + if(LLVM_VERSION VERSION_GREATER 3.4) + find_program(LLVM_COV_BIN NAMES "llvm-cov-${LLVM_VERSION}" + "llvm-cov" HINTS ${COMPILER_PATH}) + mark_as_advanced(LLVM_COV_BIN) + + if (LLVM_COV_BIN) + find_program(LLVM_COV_WRAPPER "llvm-cov-wrapper" PATHS + ${CMAKE_MODULE_PATH}) + if (LLVM_COV_WRAPPER) + set(GCOV_BIN "${LLVM_COV_WRAPPER}" CACHE FILEPATH "") + + # set additional parameters + set(GCOV_${CMAKE_${LANG}_COMPILER_ID}_ENV + "LLVM_COV_BIN=${LLVM_COV_BIN}" CACHE STRING + "Environment variables for llvm-cov-wrapper.") + mark_as_advanced(GCOV_${CMAKE_${LANG}_COMPILER_ID}_ENV) + endif () + endif () + endif () + + if (NOT GCOV_BIN) + # Fall back to gcov binary if llvm-cov was not found or is + # incompatible. This is the default on OSX, but may crash on + # recent Linux versions. 
+ find_program(GCOV_BIN gcov HINTS ${COMPILER_PATH}) + endif () + endif () + + + if (GCOV_BIN) + set(GCOV_${CMAKE_${LANG}_COMPILER_ID}_BIN "${GCOV_BIN}" CACHE STRING + "${LANG} gcov binary.") + + if (NOT CMAKE_REQUIRED_QUIET) + message("-- Found gcov evaluation for " + "${CMAKE_${LANG}_COMPILER_ID}: ${GCOV_BIN}") + endif() + + unset(GCOV_BIN CACHE) + endif () + endif () +endforeach () + + + + +# Add a new global target for all gcov targets. This target could be used to +# generate the gcov files for the whole project instead of calling -gcov +# for each target. +if (NOT TARGET gcov) + add_custom_target(gcov) +endif (NOT TARGET gcov) + + + +# This function will add gcov evaluation for target . Only sources of +# this target will be evaluated and no dependencies will be added. It will call +# Gcov on any source file of once and store the gcov file in the same +# directory. +function (add_gcov_target TNAME) + set(TDIR ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${TNAME}.dir) + + # We don't have to check, if the target has support for coverage, thus this + # will be checked by add_coverage_target in Findcoverage.cmake. Instead we + # have to determine which gcov binary to use. + get_target_property(TSOURCES ${TNAME} SOURCES) + set(SOURCES "") + set(TCOMPILER "") + foreach (FILE ${TSOURCES}) + codecov_path_of_source(${FILE} FILE) + if (NOT "${FILE}" STREQUAL "") + codecov_lang_of_source(${FILE} LANG) + if (NOT "${LANG}" STREQUAL "") + list(APPEND SOURCES "${FILE}") + set(TCOMPILER ${CMAKE_${LANG}_COMPILER_ID}) + endif () + endif () + endforeach () + + # If no gcov binary was found, coverage data can't be evaluated. 
+ if (NOT GCOV_${TCOMPILER}_BIN) + message(WARNING "No coverage evaluation binary found for ${TCOMPILER}.") + return() + endif () + + set(GCOV_BIN "${GCOV_${TCOMPILER}_BIN}") + set(GCOV_ENV "${GCOV_${TCOMPILER}_ENV}") + + + set(BUFFER "") + foreach(FILE ${SOURCES}) + get_filename_component(FILE_PATH "${TDIR}/${FILE}" PATH) + + # call gcov + add_custom_command(OUTPUT ${TDIR}/${FILE}.gcov + COMMAND ${GCOV_ENV} ${GCOV_BIN} ${TDIR}/${FILE}.gcno > /dev/null + DEPENDS ${TNAME} ${TDIR}/${FILE}.gcno + WORKING_DIRECTORY ${FILE_PATH} + ) + + list(APPEND BUFFER ${TDIR}/${FILE}.gcov) + endforeach() + + + # add target for gcov evaluation of + add_custom_target(${TNAME}-gcov DEPENDS ${BUFFER}) + + # add evaluation target to the global gcov target. + add_dependencies(gcov ${TNAME}-gcov) +endfunction (add_gcov_target) diff --git a/ethosu/regor/dependencies/thirdparty/Catch2/CMake/FindLcov.cmake b/ethosu/regor/dependencies/thirdparty/Catch2/CMake/FindLcov.cmake new file mode 100644 index 00000000..beb925ae --- /dev/null +++ b/ethosu/regor/dependencies/thirdparty/Catch2/CMake/FindLcov.cmake @@ -0,0 +1,354 @@ +# This file is part of CMake-codecov. +# +# Copyright (c) +# 2015-2017 RWTH Aachen University, Federal Republic of Germany +# +# See the LICENSE file in the package base directory for details +# +# Written by Alexander Haase, alexander.haase@rwth-aachen.de +# + + +# configuration +set(LCOV_DATA_PATH "${CMAKE_BINARY_DIR}/lcov/data") +set(LCOV_DATA_PATH_INIT "${LCOV_DATA_PATH}/init") +set(LCOV_DATA_PATH_CAPTURE "${LCOV_DATA_PATH}/capture") +set(LCOV_HTML_PATH "${CMAKE_BINARY_DIR}/lcov/html") + + + + +# Search for Gcov which is used by Lcov. +find_package(Gcov) + + + + +# This function will add lcov evaluation for target . Only sources of +# this target will be evaluated and no dependencies will be added. It will call +# geninfo on any source file of once and store the info file in the same +# directory. 
+# +# Note: This function is only a wrapper to define this function always, even if +# coverage is not supported by the compiler or disabled. This function must +# be defined here, because the module will be exited, if there is no coverage +# support by the compiler or it is disabled by the user. +function (add_lcov_target TNAME) + if (LCOV_FOUND) + # capture initial coverage data + lcov_capture_initial_tgt(${TNAME}) + + # capture coverage data after execution + lcov_capture_tgt(${TNAME}) + endif () +endfunction (add_lcov_target) + + + + +# include required Modules +include(FindPackageHandleStandardArgs) + +# Search for required lcov binaries. +find_program(LCOV_BIN lcov) +find_program(GENINFO_BIN geninfo) +find_program(GENHTML_BIN genhtml) +find_package_handle_standard_args(lcov + REQUIRED_VARS LCOV_BIN GENINFO_BIN GENHTML_BIN +) + +# enable genhtml C++ demangeling, if c++filt is found. +set(GENHTML_CPPFILT_FLAG "") +find_program(CPPFILT_BIN c++filt) +if (NOT CPPFILT_BIN STREQUAL "") + set(GENHTML_CPPFILT_FLAG "--demangle-cpp") +endif (NOT CPPFILT_BIN STREQUAL "") + +# enable no-external flag for lcov, if available. +if (GENINFO_BIN AND NOT DEFINED GENINFO_EXTERN_FLAG) + set(FLAG "") + execute_process(COMMAND ${GENINFO_BIN} --help OUTPUT_VARIABLE GENINFO_HELP) + string(REGEX MATCH "external" GENINFO_RES "${GENINFO_HELP}") + if (GENINFO_RES) + set(FLAG "--no-external") + endif () + + set(GENINFO_EXTERN_FLAG "${FLAG}" + CACHE STRING "Geninfo flag to exclude system sources.") +endif () + +# If Lcov was not found, exit module now. +if (NOT LCOV_FOUND) + return() +endif (NOT LCOV_FOUND) + + + + +# Create directories to be used. +file(MAKE_DIRECTORY ${LCOV_DATA_PATH_INIT}) +file(MAKE_DIRECTORY ${LCOV_DATA_PATH_CAPTURE}) + +set(LCOV_REMOVE_PATTERNS "") + +# This function will merge lcov files to a single target file. Additional lcov +# flags may be set with setting LCOV_EXTRA_FLAGS before calling this function. +function (lcov_merge_files OUTFILE ...) 
+ # Remove ${OUTFILE} from ${ARGV} and generate lcov parameters with files. + list(REMOVE_AT ARGV 0) + + # Generate merged file. + string(REPLACE "${CMAKE_BINARY_DIR}/" "" FILE_REL "${OUTFILE}") + add_custom_command(OUTPUT "${OUTFILE}.raw" + COMMAND cat ${ARGV} > ${OUTFILE}.raw + DEPENDS ${ARGV} + COMMENT "Generating ${FILE_REL}" + ) + + add_custom_command(OUTPUT "${OUTFILE}" + COMMAND ${LCOV_BIN} --quiet -a ${OUTFILE}.raw --output-file ${OUTFILE} + --base-directory ${PROJECT_SOURCE_DIR} ${LCOV_EXTRA_FLAGS} + COMMAND ${LCOV_BIN} --quiet -r ${OUTFILE} ${LCOV_REMOVE_PATTERNS} + --output-file ${OUTFILE} ${LCOV_EXTRA_FLAGS} + DEPENDS ${OUTFILE}.raw + COMMENT "Post-processing ${FILE_REL}" + ) +endfunction () + + + + +# Add a new global target to generate initial coverage reports for all targets. +# This target will be used to generate the global initial info file, which is +# used to gather even empty report data. +if (NOT TARGET lcov-capture-init) + add_custom_target(lcov-capture-init) + set(LCOV_CAPTURE_INIT_FILES "" CACHE INTERNAL "") +endif (NOT TARGET lcov-capture-init) + + +# This function will add initial capture of coverage data for target , +# which is needed to get also data for objects, which were not loaded at +# execution time. It will call geninfo for every source file of once and +# store the info file in the same directory. +function (lcov_capture_initial_tgt TNAME) + # We don't have to check, if the target has support for coverage, thus this + # will be checked by add_coverage_target in Findcoverage.cmake. Instead we + # have to determine which gcov binary to use. 
+ get_target_property(TSOURCES ${TNAME} SOURCES) + set(SOURCES "") + set(TCOMPILER "") + foreach (FILE ${TSOURCES}) + codecov_path_of_source(${FILE} FILE) + if (NOT "${FILE}" STREQUAL "") + codecov_lang_of_source(${FILE} LANG) + if (NOT "${LANG}" STREQUAL "") + list(APPEND SOURCES "${FILE}") + set(TCOMPILER ${CMAKE_${LANG}_COMPILER_ID}) + endif () + endif () + endforeach () + + # If no gcov binary was found, coverage data can't be evaluated. + if (NOT GCOV_${TCOMPILER}_BIN) + message(WARNING "No coverage evaluation binary found for ${TCOMPILER}.") + return() + endif () + + set(GCOV_BIN "${GCOV_${TCOMPILER}_BIN}") + set(GCOV_ENV "${GCOV_${TCOMPILER}_ENV}") + + + set(TDIR ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${TNAME}.dir) + set(GENINFO_FILES "") + foreach(FILE ${SOURCES}) + # generate empty coverage files + set(OUTFILE "${TDIR}/${FILE}.info.init") + list(APPEND GENINFO_FILES ${OUTFILE}) + + add_custom_command(OUTPUT ${OUTFILE} COMMAND ${GCOV_ENV} ${GENINFO_BIN} + --quiet --base-directory ${PROJECT_SOURCE_DIR} --initial + --gcov-tool ${GCOV_BIN} --output-filename ${OUTFILE} + ${GENINFO_EXTERN_FLAG} ${TDIR}/${FILE}.gcno + DEPENDS ${TNAME} + COMMENT "Capturing initial coverage data for ${FILE}" + ) + endforeach() + + # Concatenate all files generated by geninfo to a single file per target. + set(OUTFILE "${LCOV_DATA_PATH_INIT}/${TNAME}.info") + set(LCOV_EXTRA_FLAGS "--initial") + lcov_merge_files("${OUTFILE}" ${GENINFO_FILES}) + add_custom_target(${TNAME}-capture-init ALL DEPENDS ${OUTFILE}) + + # add geninfo file generation to global lcov-geninfo target + add_dependencies(lcov-capture-init ${TNAME}-capture-init) + set(LCOV_CAPTURE_INIT_FILES "${LCOV_CAPTURE_INIT_FILES}" + "${OUTFILE}" CACHE INTERNAL "" + ) +endfunction (lcov_capture_initial_tgt) + + +# This function will generate the global info file for all targets. 
It has to be +# called after all other CMake functions in the root CMakeLists.txt file, to get +# a full list of all targets that generate coverage data. +function (lcov_capture_initial) + # Skip this function (and do not create the following targets), if there are + # no input files. + if ("${LCOV_CAPTURE_INIT_FILES}" STREQUAL "") + return() + endif () + + # Add a new target to merge the files of all targets. + set(OUTFILE "${LCOV_DATA_PATH_INIT}/all_targets.info") + lcov_merge_files("${OUTFILE}" ${LCOV_CAPTURE_INIT_FILES}) + add_custom_target(lcov-geninfo-init ALL DEPENDS ${OUTFILE} + lcov-capture-init + ) +endfunction (lcov_capture_initial) + + + + +# Add a new global target to generate coverage reports for all targets. This +# target will be used to generate the global info file. +if (NOT TARGET lcov-capture) + add_custom_target(lcov-capture) + set(LCOV_CAPTURE_FILES "" CACHE INTERNAL "") +endif (NOT TARGET lcov-capture) + + +# This function will add capture of coverage data for target , which is +# needed to get also data for objects, which were not loaded at execution time. +# It will call geninfo for every source file of once and store the info +# file in the same directory. +function (lcov_capture_tgt TNAME) + # We don't have to check, if the target has support for coverage, thus this + # will be checked by add_coverage_target in Findcoverage.cmake. Instead we + # have to determine which gcov binary to use. + get_target_property(TSOURCES ${TNAME} SOURCES) + set(SOURCES "") + set(TCOMPILER "") + foreach (FILE ${TSOURCES}) + codecov_path_of_source(${FILE} FILE) + if (NOT "${FILE}" STREQUAL "") + codecov_lang_of_source(${FILE} LANG) + if (NOT "${LANG}" STREQUAL "") + list(APPEND SOURCES "${FILE}") + set(TCOMPILER ${CMAKE_${LANG}_COMPILER_ID}) + endif () + endif () + endforeach () + + # If no gcov binary was found, coverage data can't be evaluated. 
+ if (NOT GCOV_${TCOMPILER}_BIN) + message(WARNING "No coverage evaluation binary found for ${TCOMPILER}.") + return() + endif () + + set(GCOV_BIN "${GCOV_${TCOMPILER}_BIN}") + set(GCOV_ENV "${GCOV_${TCOMPILER}_ENV}") + + + set(TDIR ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${TNAME}.dir) + set(GENINFO_FILES "") + foreach(FILE ${SOURCES}) + # Generate coverage files. If no .gcda file was generated during + # execution, the empty coverage file will be used instead. + set(OUTFILE "${TDIR}/${FILE}.info") + list(APPEND GENINFO_FILES ${OUTFILE}) + + add_custom_command(OUTPUT ${OUTFILE} + COMMAND test -f "${TDIR}/${FILE}.gcda" + && ${GCOV_ENV} ${GENINFO_BIN} --quiet --base-directory + ${PROJECT_SOURCE_DIR} --gcov-tool ${GCOV_BIN} + --output-filename ${OUTFILE} ${GENINFO_EXTERN_FLAG} + ${TDIR}/${FILE}.gcda + || cp ${OUTFILE}.init ${OUTFILE} + DEPENDS ${TNAME} ${TNAME}-capture-init + COMMENT "Capturing coverage data for ${FILE}" + ) + endforeach() + + # Concatenate all files generated by geninfo to a single file per target. + set(OUTFILE "${LCOV_DATA_PATH_CAPTURE}/${TNAME}.info") + lcov_merge_files("${OUTFILE}" ${GENINFO_FILES}) + add_custom_target(${TNAME}-geninfo DEPENDS ${OUTFILE}) + + # add geninfo file generation to global lcov-capture target + add_dependencies(lcov-capture ${TNAME}-geninfo) + set(LCOV_CAPTURE_FILES "${LCOV_CAPTURE_FILES}" "${OUTFILE}" CACHE INTERNAL + "" + ) + + # Add target for generating html output for this target only. + file(MAKE_DIRECTORY ${LCOV_HTML_PATH}/${TNAME}) + add_custom_target(${TNAME}-genhtml + COMMAND ${GENHTML_BIN} --quiet --sort --prefix ${PROJECT_SOURCE_DIR} + --baseline-file ${LCOV_DATA_PATH_INIT}/${TNAME}.info + --output-directory ${LCOV_HTML_PATH}/${TNAME} + --title "${CMAKE_PROJECT_NAME} - target ${TNAME}" + ${GENHTML_CPPFILT_FLAG} ${OUTFILE} + DEPENDS ${TNAME}-geninfo ${TNAME}-capture-init + ) +endfunction (lcov_capture_tgt) + + +# This function will generate the global info file for all targets. 
It has to be +# called after all other CMake functions in the root CMakeLists.txt file, to get +# a full list of all targets that generate coverage data. +function (lcov_capture) + # Skip this function (and do not create the following targets), if there are + # no input files. + if ("${LCOV_CAPTURE_FILES}" STREQUAL "") + return() + endif () + + # Add a new target to merge the files of all targets. + set(OUTFILE "${LCOV_DATA_PATH_CAPTURE}/all_targets.info") + lcov_merge_files("${OUTFILE}" ${LCOV_CAPTURE_FILES}) + add_custom_target(lcov-geninfo DEPENDS ${OUTFILE} lcov-capture) + + # Add a new global target for all lcov targets. This target could be used to + # generate the lcov html output for the whole project instead of calling + # -geninfo and -genhtml for each target. It will also be + # used to generate a html site for all project data together instead of one + # for each target. + if (NOT TARGET lcov) + file(MAKE_DIRECTORY ${LCOV_HTML_PATH}/all_targets) + add_custom_target(lcov + COMMAND ${GENHTML_BIN} --quiet --sort + --baseline-file ${LCOV_DATA_PATH_INIT}/all_targets.info + --output-directory ${LCOV_HTML_PATH}/all_targets + --title "${CMAKE_PROJECT_NAME}" --prefix "${PROJECT_SOURCE_DIR}" + ${GENHTML_CPPFILT_FLAG} ${OUTFILE} + DEPENDS lcov-geninfo-init lcov-geninfo + ) + endif () +endfunction (lcov_capture) + + + + +# Add a new global target to generate the lcov html report for the whole project +# instead of calling -genhtml for each target (to create an own report +# for each target). Instead of the lcov target it does not require geninfo for +# all targets, so you have to call -geninfo to generate the info files +# the targets you'd like to have in your report or lcov-geninfo for generating +# info files for all targets before calling lcov-genhtml. 
+file(MAKE_DIRECTORY ${LCOV_HTML_PATH}/selected_targets) +if (NOT TARGET lcov-genhtml) + add_custom_target(lcov-genhtml + COMMAND ${GENHTML_BIN} + --quiet + --output-directory ${LCOV_HTML_PATH}/selected_targets + --title \"${CMAKE_PROJECT_NAME} - targets `find + ${LCOV_DATA_PATH_CAPTURE} -name \"*.info\" ! -name + \"all_targets.info\" -exec basename {} .info \\\;`\" + --prefix ${PROJECT_SOURCE_DIR} + --sort + ${GENHTML_CPPFILT_FLAG} + `find ${LCOV_DATA_PATH_CAPTURE} -name \"*.info\" ! -name + \"all_targets.info\"` + ) +endif (NOT TARGET lcov-genhtml) diff --git a/ethosu/regor/dependencies/thirdparty/Catch2/CMake/Findcodecov.cmake b/ethosu/regor/dependencies/thirdparty/Catch2/CMake/Findcodecov.cmake new file mode 100644 index 00000000..2c0f2fee --- /dev/null +++ b/ethosu/regor/dependencies/thirdparty/Catch2/CMake/Findcodecov.cmake @@ -0,0 +1,258 @@ +# This file is part of CMake-codecov. +# +# Copyright (c) +# 2015-2017 RWTH Aachen University, Federal Republic of Germany +# +# See the LICENSE file in the package base directory for details +# +# Written by Alexander Haase, alexander.haase@rwth-aachen.de +# + + +# Add an option to choose, if coverage should be enabled or not. If enabled +# marked targets will be build with coverage support and appropriate targets +# will be added. If disabled coverage will be ignored for *ALL* targets. +option(ENABLE_COVERAGE "Enable coverage build." OFF) + +set(COVERAGE_FLAG_CANDIDATES + # gcc and clang + "-O0 -g -fprofile-arcs -ftest-coverage" + + # gcc and clang fallback + "-O0 -g --coverage" +) + + +# Add coverage support for target ${TNAME} and register target for coverage +# evaluation. If coverage is disabled or not supported, this function will +# simply do nothing. +# +# Note: This function is only a wrapper to define this function always, even if +# coverage is not supported by the compiler or disabled. 
This function must +# be defined here, because the module will be exited, if there is no coverage +# support by the compiler or it is disabled by the user. +function (add_coverage TNAME) + # only add coverage for target, if coverage is supported and enabled. + if (ENABLE_COVERAGE) + foreach (TNAME ${ARGV}) + add_coverage_target(${TNAME}) + endforeach () + endif () +endfunction (add_coverage) + + +# Add global target to gather coverage information after all targets have been +# added. Other evaluation functions could be added here, after checks for the +# specific module have been passed. +# +# Note: This function is only a wrapper to define this function always, even if +# coverage is not supported by the compiler or disabled. This function must +# be defined here, because the module will be exited, if there is no coverage +# support by the compiler or it is disabled by the user. +function (coverage_evaluate) + # add lcov evaluation + if (LCOV_FOUND) + lcov_capture_initial() + lcov_capture() + endif (LCOV_FOUND) +endfunction () + + +# Exit this module, if coverage is disabled. add_coverage is defined before this +# return, so this module can be exited now safely without breaking any build- +# scripts. +if (NOT ENABLE_COVERAGE) + return() +endif () + + + + +# Find the required flags for each language. +set(CMAKE_REQUIRED_QUIET_SAVE ${CMAKE_REQUIRED_QUIET}) +set(CMAKE_REQUIRED_QUIET ${codecov_FIND_QUIETLY}) + +get_property(ENABLED_LANGUAGES GLOBAL PROPERTY ENABLED_LANGUAGES) +foreach (LANG ${ENABLED_LANGUAGES}) + # Coverage flags are not dependent on language, but the used compiler. So + # instead of searching flags for each language, search flags for each compiler + # used.
+ set(COMPILER ${CMAKE_${LANG}_COMPILER_ID}) + if (NOT COVERAGE_${COMPILER}_FLAGS) + foreach (FLAG ${COVERAGE_FLAG_CANDIDATES}) + if(NOT CMAKE_REQUIRED_QUIET) + message(STATUS "Try ${COMPILER} code coverage flag = [${FLAG}]") + endif() + + set(CMAKE_REQUIRED_FLAGS "${FLAG}") + unset(COVERAGE_FLAG_DETECTED CACHE) + + if (${LANG} STREQUAL "C") + include(CheckCCompilerFlag) + check_c_compiler_flag("${FLAG}" COVERAGE_FLAG_DETECTED) + + elseif (${LANG} STREQUAL "CXX") + include(CheckCXXCompilerFlag) + check_cxx_compiler_flag("${FLAG}" COVERAGE_FLAG_DETECTED) + + elseif (${LANG} STREQUAL "Fortran") + # CheckFortranCompilerFlag was introduced in CMake 3.x. To be + # compatible with older Cmake versions, we will check if this + # module is present before we use it. Otherwise we will define + # Fortran coverage support as not available. + include(CheckFortranCompilerFlag OPTIONAL + RESULT_VARIABLE INCLUDED) + if (INCLUDED) + check_fortran_compiler_flag("${FLAG}" + COVERAGE_FLAG_DETECTED) + elseif (NOT CMAKE_REQUIRED_QUIET) + message("-- Performing Test COVERAGE_FLAG_DETECTED") + message("-- Performing Test COVERAGE_FLAG_DETECTED - Failed" + " (Check not supported)") + endif () + endif() + + if (COVERAGE_FLAG_DETECTED) + set(COVERAGE_${COMPILER}_FLAGS "${FLAG}" + CACHE STRING "${COMPILER} flags for code coverage.") + mark_as_advanced(COVERAGE_${COMPILER}_FLAGS) + break() + else () + message(WARNING "Code coverage is not available for ${COMPILER}" + " compiler. Targets using this compiler will be " + "compiled without it.") + endif () + endforeach () + endif () +endforeach () + +set(CMAKE_REQUIRED_QUIET ${CMAKE_REQUIRED_QUIET_SAVE}) + + + + +# Helper function to get the language of a source file. 
+function (codecov_lang_of_source FILE RETURN_VAR) + get_filename_component(FILE_EXT "${FILE}" EXT) + string(TOLOWER "${FILE_EXT}" FILE_EXT) + string(SUBSTRING "${FILE_EXT}" 1 -1 FILE_EXT) + + get_property(ENABLED_LANGUAGES GLOBAL PROPERTY ENABLED_LANGUAGES) + foreach (LANG ${ENABLED_LANGUAGES}) + list(FIND CMAKE_${LANG}_SOURCE_FILE_EXTENSIONS "${FILE_EXT}" TEMP) + if (NOT ${TEMP} EQUAL -1) + set(${RETURN_VAR} "${LANG}" PARENT_SCOPE) + return() + endif () + endforeach() + + set(${RETURN_VAR} "" PARENT_SCOPE) +endfunction () + + +# Helper function to get the relative path of the source file destination path. +# This path is needed by FindGcov and FindLcov cmake files to locate the +# captured data. +function (codecov_path_of_source FILE RETURN_VAR) + string(REGEX MATCH "TARGET_OBJECTS:([^ >]+)" _source ${FILE}) + + # If expression was found, SOURCEFILE is a generator-expression for an + # object library. Currently we found no way to call this function automatically + # for the referenced target, so it must be called in the directory of the + # object library definition. + if (NOT "${_source}" STREQUAL "") + set(${RETURN_VAR} "" PARENT_SCOPE) + return() + endif () + + + string(REPLACE "${CMAKE_CURRENT_BINARY_DIR}/" "" FILE "${FILE}") + if(IS_ABSOLUTE ${FILE}) + file(RELATIVE_PATH FILE ${CMAKE_CURRENT_SOURCE_DIR} ${FILE}) + endif() + + # get the right path for file + string(REPLACE ".." "__" PATH "${FILE}") + + set(${RETURN_VAR} "${PATH}" PARENT_SCOPE) +endfunction() + + + + +# Add coverage support for target ${TNAME} and register target for coverage +# evaluation. +function(add_coverage_target TNAME) + # Check if all sources for target use the same compiler. If a target uses + # e.g. C and Fortran mixed and uses different compilers (e.g. clang and + # gfortran) this can trigger huge problems, because different compilers may + # use different implementations for code coverage.
+ get_target_property(TSOURCES ${TNAME} SOURCES) + set(TARGET_COMPILER "") + set(ADDITIONAL_FILES "") + foreach (FILE ${TSOURCES}) + # If expression was found, FILE is a generator-expression for an object + # library. Object libraries will be ignored. + string(REGEX MATCH "TARGET_OBJECTS:([^ >]+)" _file ${FILE}) + if ("${_file}" STREQUAL "") + codecov_lang_of_source(${FILE} LANG) + if (LANG) + list(APPEND TARGET_COMPILER ${CMAKE_${LANG}_COMPILER_ID}) + + list(APPEND ADDITIONAL_FILES "${FILE}.gcno") + list(APPEND ADDITIONAL_FILES "${FILE}.gcda") + endif () + endif () + endforeach () + + list(REMOVE_DUPLICATES TARGET_COMPILER) + list(LENGTH TARGET_COMPILER NUM_COMPILERS) + + if (NUM_COMPILERS GREATER 1) + message(WARNING "Can't use code coverage for target ${TNAME}, because " + "it will be compiled by incompatible compilers. Target will be " + "compiled without code coverage.") + return() + + elseif (NUM_COMPILERS EQUAL 0) + message(WARNING "Can't use code coverage for target ${TNAME}, because " + "it uses an unknown compiler. Target will be compiled without " + "code coverage.") + return() + + elseif (NOT DEFINED "COVERAGE_${TARGET_COMPILER}_FLAGS") + # A warning has been printed before, so just return if flags for this + # compiler aren't available. + return() + endif() + + + # enable coverage for target + set_property(TARGET ${TNAME} APPEND_STRING + PROPERTY COMPILE_FLAGS " ${COVERAGE_${TARGET_COMPILER}_FLAGS}") + set_property(TARGET ${TNAME} APPEND_STRING + PROPERTY LINK_FLAGS " ${COVERAGE_${TARGET_COMPILER}_FLAGS}") + + + # Add gcov files generated by compiler to clean target. 
+ set(CLEAN_FILES "") + foreach (FILE ${ADDITIONAL_FILES}) + codecov_path_of_source(${FILE} FILE) + list(APPEND CLEAN_FILES "CMakeFiles/${TNAME}.dir/${FILE}") + endforeach() + + set_directory_properties(PROPERTIES ADDITIONAL_MAKE_CLEAN_FILES + "${CLEAN_FILES}") + + + add_gcov_target(${TNAME}) + add_lcov_target(${TNAME}) +endfunction(add_coverage_target) + + + + +# Include modules for parsing the collected data and output it in a readable +# format (like gcov and lcov). +find_package(Gcov) +find_package(Lcov) diff --git a/ethosu/regor/dependencies/thirdparty/Catch2/CMake/catch2-with-main.pc.in b/ethosu/regor/dependencies/thirdparty/Catch2/CMake/catch2-with-main.pc.in new file mode 100644 index 00000000..69a790bb --- /dev/null +++ b/ethosu/regor/dependencies/thirdparty/Catch2/CMake/catch2-with-main.pc.in @@ -0,0 +1,10 @@ +includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@ +libdir=@CMAKE_INSTALL_FULL_LIBDIR@ +pkg_version=@Catch2_VERSION@ + +Name: Catch2-With-Main +Description: A modern, C++-native test framework for C++14 and above (links in default main) +Version: ${pkg_version} +Requires: catch2 = ${pkg_version} +Cflags: -I${includedir} +Libs: -L${libdir} -lCatch2Main diff --git a/ethosu/regor/dependencies/thirdparty/Catch2/CMake/catch2.pc.in b/ethosu/regor/dependencies/thirdparty/Catch2/CMake/catch2.pc.in new file mode 100644 index 00000000..bd1c95a1 --- /dev/null +++ b/ethosu/regor/dependencies/thirdparty/Catch2/CMake/catch2.pc.in @@ -0,0 +1,11 @@ +prefix=@CMAKE_INSTALL_PREFIX@ +exec_prefix=${prefix} +includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@ +libdir=@CMAKE_INSTALL_FULL_LIBDIR@ + +Name: Catch2 +Description: A modern, C++-native, test framework for C++14 and above +URL: https://github.com/catchorg/Catch2 +Version: @Catch2_VERSION@ +Cflags: -I${includedir} +Libs: -L${libdir} -lCatch2 diff --git a/ethosu/regor/dependencies/thirdparty/Catch2/CMake/llvm-cov-wrapper b/ethosu/regor/dependencies/thirdparty/Catch2/CMake/llvm-cov-wrapper new file mode 100755 index 
00000000..2ac33102 --- /dev/null +++ b/ethosu/regor/dependencies/thirdparty/Catch2/CMake/llvm-cov-wrapper @@ -0,0 +1,56 @@ +#!/bin/sh + +# This file is part of CMake-codecov. +# +# Copyright (c) +# 2015-2017 RWTH Aachen University, Federal Republic of Germany +# +# See the LICENSE file in the package base directory for details +# +# Written by Alexander Haase, alexander.haase@rwth-aachen.de +# + +if [ -z "$LLVM_COV_BIN" ] +then + echo "LLVM_COV_BIN not set!" >& 2 + exit 1 +fi + + +# Get LLVM version to find out. +LLVM_VERSION=$($LLVM_COV_BIN -version | grep -i "LLVM version" \ + | sed "s/^\([A-Za-z ]*\)\([0-9]\).\([0-9]\).*$/\2.\3/g") + +if [ "$1" = "-v" ] +then + echo "llvm-cov-wrapper $LLVM_VERSION" + exit 0 +fi + + +if [ -n "$LLVM_VERSION" ] +then + MAJOR=$(echo $LLVM_VERSION | cut -d'.' -f1) + MINOR=$(echo $LLVM_VERSION | cut -d'.' -f2) + + if [ $MAJOR -eq 3 ] && [ $MINOR -le 4 ] + then + if [ -f "$1" ] + then + filename=$(basename "$1") + extension="${filename##*.}" + + case "$extension" in + "gcno") exec $LLVM_COV_BIN --gcno="$1" ;; + "gcda") exec $LLVM_COV_BIN --gcda="$1" ;; + esac + fi + fi + + if [ $MAJOR -eq 3 ] && [ $MINOR -le 5 ] + then + exec $LLVM_COV_BIN $@ + fi +fi + +exec $LLVM_COV_BIN gcov $@ diff --git a/ethosu/regor/dependencies/thirdparty/Catch2/CMakeLists.txt b/ethosu/regor/dependencies/thirdparty/Catch2/CMakeLists.txt new file mode 100644 index 00000000..ad5d939b --- /dev/null +++ b/ethosu/regor/dependencies/thirdparty/Catch2/CMakeLists.txt @@ -0,0 +1,203 @@ +cmake_minimum_required(VERSION 3.10) + +# detect if Catch is being bundled, +# disable testsuite in that case +if(NOT DEFINED PROJECT_NAME) + set(NOT_SUBPROJECT ON) +else() + set(NOT_SUBPROJECT OFF) +endif() + +option(CATCH_INSTALL_DOCS "Install documentation alongside library" ON) +option(CATCH_INSTALL_EXTRAS "Install extras (CMake scripts, debugger helpers) alongside library" ON) +option(CATCH_DEVELOPMENT_BUILD "Build tests, enable warnings, enable Werror, etc" OFF) 
+option(CATCH_ENABLE_REPRODUCIBLE_BUILD "Add compiler flags for improving build reproducibility" ON) + +include(CMakeDependentOption) +cmake_dependent_option(CATCH_BUILD_TESTING "Build the SelfTest project" ON "CATCH_DEVELOPMENT_BUILD" OFF) +cmake_dependent_option(CATCH_BUILD_EXAMPLES "Build code examples" OFF "CATCH_DEVELOPMENT_BUILD" OFF) +cmake_dependent_option(CATCH_BUILD_EXTRA_TESTS "Build extra tests" OFF "CATCH_DEVELOPMENT_BUILD" OFF) +cmake_dependent_option(CATCH_BUILD_FUZZERS "Build fuzzers" OFF "CATCH_DEVELOPMENT_BUILD" OFF) +cmake_dependent_option(CATCH_ENABLE_COVERAGE "Generate coverage for codecov.io" OFF "CATCH_DEVELOPMENT_BUILD" OFF) +cmake_dependent_option(CATCH_ENABLE_WERROR "Enables Werror during build" ON "CATCH_DEVELOPMENT_BUILD" OFF) +cmake_dependent_option(CATCH_BUILD_SURROGATES "Enable generating and building surrogate TUs for the main headers" OFF "CATCH_DEVELOPMENT_BUILD" OFF) +cmake_dependent_option(CATCH_ENABLE_CONFIGURE_TESTS "Enable CMake configuration tests. WARNING: VERY EXPENSIVE" OFF "CATCH_DEVELOPMENT_BUILD" OFF) +cmake_dependent_option(CATCH_ENABLE_CMAKE_HELPER_TESTS "Enable CMake helper tests. WARNING: VERY EXPENSIVE" OFF "CATCH_DEVELOPMENT_BUILD" OFF) + + +# Catch2's build breaks if done in-tree. You probably should not build +# things in tree anyway, but we can allow projects that include Catch2 +# as a subproject to build in-tree as long as it is not in our tree. +if (CMAKE_BINARY_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) + message(FATAL_ERROR "Building in-source is not supported! Create a build dir and remove ${CMAKE_SOURCE_DIR}/CMakeCache.txt") +endif() + +project(Catch2 + VERSION 3.5.3 # CML version placeholder, don't delete + LANGUAGES CXX + # HOMEPAGE_URL is not supported until CMake version 3.12, which + # we do not target yet. + # HOMEPAGE_URL "https://github.com/catchorg/Catch2" + DESCRIPTION "A modern, C++-native, unit test framework." +) + + +# Provide path for scripts. 
We first add path to the scripts we don't use, +# but projects including us might, and set the path up to parent scope. +# Then we also add path that we use to configure the project, but is of +# no use to top level projects. +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/extras") +if (NOT NOT_SUBPROJECT) + set(CMAKE_MODULE_PATH "${CMAKE_MODULE_PATH}" PARENT_SCOPE) +endif() +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/CMake") + +include(GNUInstallDirs) +include(CMakePackageConfigHelpers) +include(CatchConfigOptions) +if(CATCH_DEVELOPMENT_BUILD) + include(CTest) +endif() + +# This variable is used in some subdirectories, so we need it here, rather +# than later in the install block +set(CATCH_CMAKE_CONFIG_DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/Catch2") + +# We have some Windows builds that test `wmain` entry point, +# and we need this change to be present in all binaries that +# are built during these tests, so this is required here, before +# the subdirectories are added. 
+if(CATCH_TEST_USE_WMAIN) + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /ENTRY:wmainCRTStartup") +endif() + + +# Basic paths +set(CATCH_DIR ${CMAKE_CURRENT_SOURCE_DIR}) +set(SOURCES_DIR ${CATCH_DIR}/src/catch2) +set(SELF_TEST_DIR ${CATCH_DIR}/tests/SelfTest) +set(BENCHMARK_DIR ${CATCH_DIR}/tests/Benchmark) +set(EXAMPLES_DIR ${CATCH_DIR}/examples) + +# We need to bring-in the variables defined there to this scope +add_subdirectory(src) + +# Build tests only if requested +if (BUILD_TESTING AND CATCH_BUILD_TESTING AND NOT_SUBPROJECT) + find_package(PythonInterp 3 REQUIRED) + if (NOT PYTHONINTERP_FOUND) + message(FATAL_ERROR "Python not found, but required for tests") + endif() + add_subdirectory(tests) +endif() + +if(CATCH_BUILD_EXAMPLES) + add_subdirectory(examples) +endif() + +if(CATCH_BUILD_EXTRA_TESTS) + add_subdirectory(tests/ExtraTests) +endif() + +if(CATCH_BUILD_FUZZERS) + add_subdirectory(fuzzing) +endif() + +if (CATCH_DEVELOPMENT_BUILD) + add_warnings_to_targets("${CATCH_WARNING_TARGETS}") +endif() + +# Only perform the installation steps when Catch is not being used as +# a subproject via `add_subdirectory`, or the destinations will break, +# see https://github.com/catchorg/Catch2/issues/1373 +if (NOT_SUBPROJECT) + configure_package_config_file( + ${CMAKE_CURRENT_LIST_DIR}/CMake/Catch2Config.cmake.in + ${CMAKE_CURRENT_BINARY_DIR}/Catch2Config.cmake + INSTALL_DESTINATION + ${CATCH_CMAKE_CONFIG_DESTINATION} + ) + + write_basic_package_version_file( + "${CMAKE_CURRENT_BINARY_DIR}/Catch2ConfigVersion.cmake" + COMPATIBILITY + SameMajorVersion + ) + + install( + FILES + "${CMAKE_CURRENT_BINARY_DIR}/Catch2Config.cmake" + "${CMAKE_CURRENT_BINARY_DIR}/Catch2ConfigVersion.cmake" + DESTINATION + ${CATCH_CMAKE_CONFIG_DESTINATION} + ) + + # Install documentation + if(CATCH_INSTALL_DOCS) + install( + DIRECTORY + docs/ + DESTINATION + "${CMAKE_INSTALL_DOCDIR}" + PATTERN "doxygen" EXCLUDE + ) + endif() + + if(CATCH_INSTALL_EXTRAS) + # Install CMake scripts + 
install( + FILES + "extras/ParseAndAddCatchTests.cmake" + "extras/Catch.cmake" + "extras/CatchAddTests.cmake" + "extras/CatchShardTests.cmake" + "extras/CatchShardTestsImpl.cmake" + DESTINATION + ${CATCH_CMAKE_CONFIG_DESTINATION} + ) + + # Install debugger helpers + install( + FILES + "extras/gdbinit" + "extras/lldbinit" + DESTINATION + ${CMAKE_INSTALL_DATAROOTDIR}/Catch2 + ) + endif() + + ## Provide some pkg-config integration + set(PKGCONFIG_INSTALL_DIR + "${CMAKE_INSTALL_DATAROOTDIR}/pkgconfig" + CACHE PATH "Path where catch2.pc is installed" + ) + configure_file( + ${CMAKE_CURRENT_SOURCE_DIR}/CMake/catch2.pc.in + ${CMAKE_CURRENT_BINARY_DIR}/catch2.pc + @ONLY + ) + configure_file( + ${CMAKE_CURRENT_SOURCE_DIR}/CMake/catch2-with-main.pc.in + ${CMAKE_CURRENT_BINARY_DIR}/catch2-with-main.pc + @ONLY + ) + install( + FILES + "${CMAKE_CURRENT_BINARY_DIR}/catch2.pc" + "${CMAKE_CURRENT_BINARY_DIR}/catch2-with-main.pc" + DESTINATION + ${PKGCONFIG_INSTALL_DIR} + ) + + # CPack/CMake started taking the package version from project version 3.12 + # So we need to set the version manually for older CMake versions + if(${CMAKE_VERSION} VERSION_LESS "3.12.0") + set(CPACK_PACKAGE_VERSION ${PROJECT_VERSION}) + endif() + + set(CPACK_PACKAGE_CONTACT "https://github.com/catchorg/Catch2/") + + + include( CPack ) + +endif(NOT_SUBPROJECT) diff --git a/ethosu/regor/dependencies/thirdparty/Catch2/CMakePresets.json b/ethosu/regor/dependencies/thirdparty/Catch2/CMakePresets.json new file mode 100644 index 00000000..88541285 --- /dev/null +++ b/ethosu/regor/dependencies/thirdparty/Catch2/CMakePresets.json @@ -0,0 +1,26 @@ +{ + "version": 3, + "configurePresets": [ + { + "name": "basic-tests", + "displayName": "Basic development build", + "description": "Enables development build with basic tests that are cheap to build and run", + "cacheVariables": { + "CATCH_DEVELOPMENT_BUILD": "ON" + } + }, + { + "name": "all-tests", + "inherits": "basic-tests", + "displayName": "Full development build", + 
"description": "Enables development build with examples and ALL tests", + "cacheVariables": { + "CATCH_BUILD_EXAMPLES": "ON", + "CATCH_BUILD_EXTRA_TESTS": "ON", + "CATCH_BUILD_SURROGATES": "ON", + "CATCH_ENABLE_CONFIGURE_TESTS": "ON", + "CATCH_ENABLE_CMAKE_HELPER_TESTS": "ON" + } + } + ] +} diff --git a/ethosu/regor/dependencies/thirdparty/Catch2/CODE_OF_CONDUCT.md b/ethosu/regor/dependencies/thirdparty/Catch2/CODE_OF_CONDUCT.md new file mode 100644 index 00000000..be1a688e --- /dev/null +++ b/ethosu/regor/dependencies/thirdparty/Catch2/CODE_OF_CONDUCT.md @@ -0,0 +1,46 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. 
+ +## Our Standards + +Examples of behavior that contributes to creating a positive environment include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery and unwelcome sexual attention or advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. + +Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. + +## Scope + +This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 
+ +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at github@philnash.me. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version] + +[homepage]: http://contributor-covenant.org +[version]: http://contributor-covenant.org/version/1/4/ diff --git a/ethosu/regor/dependencies/thirdparty/Catch2/Doxyfile b/ethosu/regor/dependencies/thirdparty/Catch2/Doxyfile new file mode 100644 index 00000000..914e5984 --- /dev/null +++ b/ethosu/regor/dependencies/thirdparty/Catch2/Doxyfile @@ -0,0 +1,2650 @@ +# Doxyfile 1.9.1 + +# This file describes the settings to be used by the documentation system +# doxygen (www.doxygen.org) for a project. +# +# All text after a double hash (##) is considered a comment and is placed in +# front of the TAG it is preceding. +# +# All text after a single hash (#) is considered a comment and will be ignored. +# The format is: +# TAG = value [value, ...] +# For lists, items can also be appended using: +# TAG += value [value, ...] +# Values that contain spaces should be placed between quotes (\" \"). 
+ +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- + +# This tag specifies the encoding used for all characters in the configuration +# file that follow. The default is UTF-8 which is also the encoding used for all +# text before the first occurrence of this tag. Doxygen uses libiconv (or the +# iconv built into libc) for the transcoding. See +# https://www.gnu.org/software/libiconv/ for the list of possible encodings. +# The default value is: UTF-8. + +DOXYFILE_ENCODING = UTF-8 + +# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by +# double-quotes, unless you are using Doxywizard) that should identify the +# project for which the documentation is generated. This name is used in the +# title of most generated pages and in a few other places. +# The default value is: My Project. + +PROJECT_NAME = Catch2 + +# The PROJECT_NUMBER tag can be used to enter a project or revision number. This +# could be handy for archiving the generated documentation or if some version +# control system is used. + +PROJECT_NUMBER = + +# Using the PROJECT_BRIEF tag one can provide an optional one line description +# for a project that appears at the top of each page and should give viewer a +# quick idea about the purpose of the project. Keep the description short. + +PROJECT_BRIEF = "Popular C++ unit testing framework" + +# With the PROJECT_LOGO tag one can specify a logo or an icon that is included +# in the documentation. The maximum height of the logo should not exceed 55 +# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy +# the logo to the output directory. + +PROJECT_LOGO = + +# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path +# into which the generated documentation will be written. 
If a relative path is +# entered, it will be relative to the location where doxygen was started. If +# left blank the current directory will be used. + +OUTPUT_DIRECTORY = docs/doxygen + +# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- +# directories (in 2 levels) under the output directory of each output format and +# will distribute the generated files over these directories. Enabling this +# option can be useful when feeding doxygen a huge amount of source files, where +# putting all generated files in the same directory would otherwise causes +# performance problems for the file system. +# The default value is: NO. + +CREATE_SUBDIRS = NO + +# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII +# characters to appear in the names of generated files. If set to NO, non-ASCII +# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode +# U+3044. +# The default value is: NO. + +ALLOW_UNICODE_NAMES = NO + +# The OUTPUT_LANGUAGE tag is used to specify the language in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all constant output in the proper language. +# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, +# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), +# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, +# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), +# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, +# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, +# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, +# Ukrainian and Vietnamese. +# The default value is: English. + +OUTPUT_LANGUAGE = English + +# The OUTPUT_TEXT_DIRECTION tag is used to specify the direction in which all +# documentation generated by doxygen is written. 
Doxygen will use this +# information to generate all generated output in the proper direction. +# Possible values are: None, LTR, RTL and Context. +# The default value is: None. + +OUTPUT_TEXT_DIRECTION = None + +# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member +# descriptions after the members that are listed in the file and class +# documentation (similar to Javadoc). Set to NO to disable this. +# The default value is: YES. + +BRIEF_MEMBER_DESC = YES + +# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief +# description of a member or function before the detailed description +# +# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# brief descriptions will be completely suppressed. +# The default value is: YES. + +REPEAT_BRIEF = YES + +# This tag implements a quasi-intelligent brief description abbreviator that is +# used to form the text in various listings. Each string in this list, if found +# as the leading text of the brief description, will be stripped from the text +# and the result, after processing the whole list, is used as the annotated +# text. Otherwise, the brief description is used as-is. If left blank, the +# following values are used ($name is automatically replaced with the name of +# the entity):The $name class, The $name widget, The $name file, is, provides, +# specifies, contains, represents, a, an and the. + +ABBREVIATE_BRIEF = + +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# doxygen will generate a detailed section even if there is only a brief +# description. +# The default value is: NO. + +ALWAYS_DETAILED_SEC = NO + +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# inherited members of a class in the documentation of that class as if those +# members were ordinary class members. Constructors, destructors and assignment +# operators of the base classes will not be shown. +# The default value is: NO. 
+ +INLINE_INHERITED_MEMB = NO + +# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path +# before files name in the file list and in the header files. If set to NO the +# shortest path that makes the file name unique will be used +# The default value is: YES. + +FULL_PATH_NAMES = YES + +# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. +# Stripping is only done if one of the specified strings matches the left-hand +# part of the path. The tag can be used to show relative paths in the file list. +# If left blank the directory from which doxygen is run is used as the path to +# strip. +# +# Note that you can specify absolute paths here, but also relative paths, which +# will be relative from the directory where doxygen is started. +# This tag requires that the tag FULL_PATH_NAMES is set to YES. + +STRIP_FROM_PATH = + +# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the +# path mentioned in the documentation of a class, which tells the reader which +# header file to include in order to use a class. If left blank only the name of +# the header file containing the class definition is used. Otherwise one should +# specify the list of include paths that are normally passed to the compiler +# using the -I flag. + +STRIP_FROM_INC_PATH = + +# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but +# less readable) file names. This can be useful is your file systems doesn't +# support long names like on DOS, Mac, or CD-ROM. +# The default value is: NO. + +SHORT_NAMES = NO + +# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the +# first line (until the first dot) of a Javadoc-style comment as the brief +# description. If set to NO, the Javadoc-style will behave just like regular Qt- +# style comments (thus requiring an explicit @brief command for a brief +# description.) +# The default value is: NO. 
+ +JAVADOC_AUTOBRIEF = YES + +# If the JAVADOC_BANNER tag is set to YES then doxygen will interpret a line +# such as +# /*************** +# as being the beginning of a Javadoc-style comment "banner". If set to NO, the +# Javadoc-style will behave just like regular comments and it will not be +# interpreted by doxygen. +# The default value is: NO. + +JAVADOC_BANNER = NO + +# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first +# line (until the first dot) of a Qt-style comment as the brief description. If +# set to NO, the Qt-style will behave just like regular Qt-style comments (thus +# requiring an explicit \brief command for a brief description.) +# The default value is: NO. + +QT_AUTOBRIEF = YES + +# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a +# multi-line C++ special comment block (i.e. a block of //! or /// comments) as +# a brief description. This used to be the default behavior. The new default is +# to treat a multi-line C++ comment block as a detailed description. Set this +# tag to YES if you prefer the old behavior instead. +# +# Note that setting this tag to YES also means that rational rose comments are +# not recognized any more. +# The default value is: NO. + +MULTILINE_CPP_IS_BRIEF = NO + +# By default Python docstrings are displayed as preformatted text and doxygen's +# special commands cannot be used. By setting PYTHON_DOCSTRING to NO the +# doxygen's special commands can be used and the contents of the docstring +# documentation blocks is shown as doxygen documentation. +# The default value is: YES. + +PYTHON_DOCSTRING = YES + +# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the +# documentation from any documented member that it re-implements. +# The default value is: YES. + +INHERIT_DOCS = YES + +# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new +# page for each member. 
If set to NO, the documentation of a member will be part +# of the file/class/namespace that contains it. +# The default value is: NO. + +SEPARATE_MEMBER_PAGES = NO + +# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen +# uses this value to replace tabs by spaces in code fragments. +# Minimum value: 1, maximum value: 16, default value: 4. + +TAB_SIZE = 4 + +# This tag can be used to specify a number of aliases that act as commands in +# the documentation. An alias has the form: +# name=value +# For example adding +# "sideeffect=@par Side Effects:\n" +# will allow you to put the command \sideeffect (or @sideeffect) in the +# documentation, which will result in a user-defined paragraph with heading +# "Side Effects:". You can put \n's in the value part of an alias to insert +# newlines (in the resulting output). You can put ^^ in the value part of an +# alias to insert a newline as if a physical newline was in the original file. +# When you need a literal { or } or , in the value part of an alias you have to +# escape them by means of a backslash (\), this can lead to conflicts with the +# commands \{ and \} for these it is advised to use the version @{ and @} or use +# a double escape (\\{ and \\}) + +ALIASES = "complexity=@par Complexity:" \ + noexcept=**Noexcept** + +# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources +# only. Doxygen will then generate output that is more tailored for C. For +# instance, some of the names that are used will be different. The list of all +# members will be omitted, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_FOR_C = NO + +# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or +# Python sources only. Doxygen will then generate output that is more tailored +# for that language. For instance, namespaces will be presented as packages, +# qualified scopes will look different, etc. +# The default value is: NO. 
+ +OPTIMIZE_OUTPUT_JAVA = NO + +# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran +# sources. Doxygen will then generate output that is tailored for Fortran. +# The default value is: NO. + +OPTIMIZE_FOR_FORTRAN = NO + +# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL +# sources. Doxygen will then generate output that is tailored for VHDL. +# The default value is: NO. + +OPTIMIZE_OUTPUT_VHDL = NO + +# Set the OPTIMIZE_OUTPUT_SLICE tag to YES if your project consists of Slice +# sources only. Doxygen will then generate output that is more tailored for that +# language. For instance, namespaces will be presented as modules, types will be +# separated into more groups, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_SLICE = NO + +# Doxygen selects the parser to use depending on the extension of the files it +# parses. With this tag you can assign which parser to use for a given +# extension. Doxygen has a built-in mapping, but you can override or extend it +# using this tag. The format is ext=language, where ext is a file extension, and +# language is one of the parsers supported by doxygen: IDL, Java, JavaScript, +# Csharp (C#), C, C++, D, PHP, md (Markdown), Objective-C, Python, Slice, VHDL, +# Fortran (fixed format Fortran: FortranFixed, free formatted Fortran: +# FortranFree, unknown formatted Fortran: Fortran. In the latter case the parser +# tries to guess whether the code is fixed or free formatted code, this is the +# default for Fortran type files). For instance to make doxygen treat .inc files +# as Fortran files (default is PHP), and .f files as C (default is Fortran), +# use: inc=Fortran f=C. +# +# Note: For files without extension you can use no_extension as a placeholder. +# +# Note that for custom extensions you also need to set FILE_PATTERNS otherwise +# the files are not read by doxygen. When specifying no_extension you should add +# * to the FILE_PATTERNS. 
+# +# Note see also the list of default file extension mappings. + +EXTENSION_MAPPING = + +# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments +# according to the Markdown format, which allows for more readable +# documentation. See https://daringfireball.net/projects/markdown/ for details. +# The output of markdown processing is further processed by doxygen, so you can +# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in +# case of backward compatibilities issues. +# The default value is: YES. + +MARKDOWN_SUPPORT = YES + +# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up +# to that level are automatically included in the table of contents, even if +# they do not have an id attribute. +# Note: This feature currently applies only to Markdown headings. +# Minimum value: 0, maximum value: 99, default value: 5. +# This tag requires that the tag MARKDOWN_SUPPORT is set to YES. + +TOC_INCLUDE_HEADINGS = 5 + +# When enabled doxygen tries to link words that correspond to documented +# classes, or namespaces to their corresponding documentation. Such a link can +# be prevented in individual cases by putting a % sign in front of the word or +# globally by setting AUTOLINK_SUPPORT to NO. +# The default value is: YES. + +AUTOLINK_SUPPORT = YES + +# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want +# to include (a tag file for) the STL sources as input, then you should set this +# tag to YES in order to let doxygen match functions declarations and +# definitions whose arguments contain STL classes (e.g. func(std::string); +# versus func(std::string) {}). This also make the inheritance and collaboration +# diagrams that involve STL classes more complete and accurate. +# The default value is: NO. + +BUILTIN_STL_SUPPORT = YES + +# If you use Microsoft's C++/CLI language, you should set this option to YES to +# enable parsing support. +# The default value is: NO. 
+ +CPP_CLI_SUPPORT = NO + +# Set the SIP_SUPPORT tag to YES if your project consists of sip (see: +# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen +# will parse them like normal C++ but will assume all classes use public instead +# of private inheritance when no explicit protection keyword is present. +# The default value is: NO. + +SIP_SUPPORT = NO + +# For Microsoft's IDL there are propget and propput attributes to indicate +# getter and setter methods for a property. Setting this option to YES will make +# doxygen to replace the get and set methods by a property in the documentation. +# This will only work if the methods are indeed getting or setting a simple +# type. If this is not the case, or you want to show the methods anyway, you +# should set this option to NO. +# The default value is: YES. + +IDL_PROPERTY_SUPPORT = YES + +# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC +# tag is set to YES then doxygen will reuse the documentation of the first +# member in the group (if any) for the other members of the group. By default +# all members of a group must be documented explicitly. +# The default value is: NO. + +DISTRIBUTE_GROUP_DOC = NO + +# If one adds a struct or class to a group and this option is enabled, then also +# any nested class or struct is added to the same group. By default this option +# is disabled and one has to add nested compounds explicitly via \ingroup. +# The default value is: NO. + +GROUP_NESTED_COMPOUNDS = NO + +# Set the SUBGROUPING tag to YES to allow class member groups of the same type +# (for instance a group of public functions) to be put as a subgroup of that +# type (e.g. under the Public Functions section). Set it to NO to prevent +# subgrouping. Alternatively, this can be done per class using the +# \nosubgrouping command. +# The default value is: YES. 
+ +SUBGROUPING = YES + +# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions +# are shown inside the group in which they are included (e.g. using \ingroup) +# instead of on a separate page (for HTML and Man pages) or section (for LaTeX +# and RTF). +# +# Note that this feature does not work in combination with +# SEPARATE_MEMBER_PAGES. +# The default value is: NO. + +INLINE_GROUPED_CLASSES = YES + +# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions +# with only public data fields or simple typedef fields will be shown inline in +# the documentation of the scope in which they are defined (i.e. file, +# namespace, or group documentation), provided this scope is documented. If set +# to NO, structs, classes, and unions are shown on a separate page (for HTML and +# Man pages) or section (for LaTeX and RTF). +# The default value is: NO. + +INLINE_SIMPLE_STRUCTS = YES + +# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or +# enum is documented as struct, union, or enum with the name of the typedef. So +# typedef struct TypeS {} TypeT, will appear in the documentation as a struct +# with name TypeT. When disabled the typedef will appear as a member of a file, +# namespace, or class. And the struct will be named TypeS. This can typically be +# useful for C code in case the coding convention dictates that all compound +# types are typedef'ed and only the typedef is referenced, never the tag name. +# The default value is: NO. + +TYPEDEF_HIDES_STRUCT = NO + +# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This +# cache is used to resolve symbols given their name and scope. Since this can be +# an expensive process and often the same symbol appears multiple times in the +# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small +# doxygen will become slower. If the cache is too large, memory is wasted. 
The +# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range +# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 +# symbols. At the end of a run doxygen will report the cache usage and suggest +# the optimal cache size from a speed point of view. +# Minimum value: 0, maximum value: 9, default value: 0. + +LOOKUP_CACHE_SIZE = 0 + +# The NUM_PROC_THREADS specifies the number of threads doxygen is allowed to use +# during processing. When set to 0 doxygen will base this on the number of +# cores available in the system. You can set it explicitly to a value larger +# than 0 to get more control over the balance between CPU load and processing +# speed. At this moment only the input processing can be done using multiple +# threads. Since this is still an experimental feature the default is set to 1, +# which effectively disables parallel processing. Please report any issues you +# encounter. Generating dot graphs in parallel is controlled by the +# DOT_NUM_THREADS setting. +# Minimum value: 0, maximum value: 32, default value: 1. + +NUM_PROC_THREADS = 1 + +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- + +# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in +# documentation are documented, even if no documentation was available. Private +# class members and static file members will be hidden unless the +# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. +# Note: This will also disable the warnings about undocumented members that are +# normally produced when WARNINGS is set to YES. +# The default value is: NO. + +EXTRACT_ALL = YES + +# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will +# be included in the documentation. +# The default value is: NO. 
+ +EXTRACT_PRIVATE = NO + +# If the EXTRACT_PRIV_VIRTUAL tag is set to YES, documented private virtual +# methods of a class will be included in the documentation. +# The default value is: NO. + +EXTRACT_PRIV_VIRTUAL = NO + +# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal +# scope will be included in the documentation. +# The default value is: NO. + +EXTRACT_PACKAGE = YES + +# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be +# included in the documentation. +# The default value is: NO. + +EXTRACT_STATIC = YES + +# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined +# locally in source files will be included in the documentation. If set to NO, +# only classes defined in header files are included. Does not have any effect +# for Java sources. +# The default value is: YES. + +EXTRACT_LOCAL_CLASSES = NO + +# This flag is only useful for Objective-C code. If set to YES, local methods, +# which are defined in the implementation section but not in the interface are +# included in the documentation. If set to NO, only methods in the interface are +# included. +# The default value is: NO. + +EXTRACT_LOCAL_METHODS = NO + +# If this flag is set to YES, the members of anonymous namespaces will be +# extracted and appear in the documentation as a namespace called +# 'anonymous_namespace{file}', where file will be replaced with the base name of +# the file that contains the anonymous namespace. By default anonymous namespace +# are hidden. +# The default value is: NO. + +EXTRACT_ANON_NSPACES = NO + +# If this flag is set to YES, the name of an unnamed parameter in a declaration +# will be determined by the corresponding definition. By default unnamed +# parameters remain unnamed in the output. +# The default value is: YES. + +RESOLVE_UNNAMED_PARAMS = YES + +# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all +# undocumented members inside documented classes or files. 
If set to NO these +# members will be included in the various overviews, but no documentation +# section is generated. This option has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_MEMBERS = NO + +# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all +# undocumented classes that are normally visible in the class hierarchy. If set +# to NO, these classes will be included in the various overviews. This option +# has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_CLASSES = NO + +# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend +# declarations. If set to NO, these declarations will be included in the +# documentation. +# The default value is: NO. + +HIDE_FRIEND_COMPOUNDS = NO + +# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any +# documentation blocks found inside the body of a function. If set to NO, these +# blocks will be appended to the function's detailed documentation block. +# The default value is: NO. + +HIDE_IN_BODY_DOCS = NO + +# The INTERNAL_DOCS tag determines if documentation that is typed after a +# \internal command is included. If the tag is set to NO then the documentation +# will be excluded. Set it to YES to include the internal documentation. +# The default value is: NO. + +INTERNAL_DOCS = NO + +# With the correct setting of option CASE_SENSE_NAMES doxygen will better be +# able to match the capabilities of the underlying filesystem. In case the +# filesystem is case sensitive (i.e. it supports files in the same directory +# whose names only differ in casing), the option must be set to YES to properly +# deal with such files in case they appear in the input. 
For filesystems that +# are not case sensitive the option should be set to NO to properly deal with +# output files written for symbols that only differ in casing, such as for two +# classes, one named CLASS and the other named Class, and to also support +# references to files without having to specify the exact matching casing. On +# Windows (including Cygwin) and MacOS, users should typically set this option +# to NO, whereas on Linux or other Unix flavors it should typically be set to +# YES. +# The default value is: system dependent. + +CASE_SENSE_NAMES = NO + +# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with +# their full class and namespace scopes in the documentation. If set to YES, the +# scope will be hidden. +# The default value is: NO. + +HIDE_SCOPE_NAMES = NO + +# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will +# append additional text to a page's title, such as Class Reference. If set to +# YES the compound reference will be hidden. +# The default value is: NO. + +HIDE_COMPOUND_REFERENCE= NO + +# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of +# the files that are included by a file in the documentation of that file. +# The default value is: YES. + +SHOW_INCLUDE_FILES = YES + +# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each +# grouped member an include statement to the documentation, telling the reader +# which file to include in order to use the member. +# The default value is: NO. + +SHOW_GROUPED_MEMB_INC = NO + +# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include +# files with double quotes in the documentation rather than with sharp brackets. +# The default value is: NO. + +FORCE_LOCAL_INCLUDES = NO + +# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the +# documentation for inline members. +# The default value is: YES. 
+ +INLINE_INFO = YES + +# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the +# (detailed) documentation of file and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. +# The default value is: YES. + +SORT_MEMBER_DOCS = YES + +# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief +# descriptions of file, namespace and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. Note that +# this will also influence the order of the classes in the class list. +# The default value is: NO. + +SORT_BRIEF_DOCS = YES + +# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the +# (brief and detailed) documentation of class members so that constructors and +# destructors are listed first. If set to NO the constructors will appear in the +# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. +# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief +# member documentation. +# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting +# detailed member documentation. +# The default value is: NO. + +SORT_MEMBERS_CTORS_1ST = YES + +# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy +# of group names into alphabetical order. If set to NO the group names will +# appear in their defined order. +# The default value is: NO. + +SORT_GROUP_NAMES = NO + +# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by +# fully-qualified names, including namespaces. If set to NO, the class list will +# be sorted only by class name, not including the namespace part. +# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. +# Note: This option applies only to the class list, not to the alphabetical +# list. +# The default value is: NO. 
+ +SORT_BY_SCOPE_NAME = NO + +# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper +# type resolution of all parameters of a function it will reject a match between +# the prototype and the implementation of a member function even if there is +# only one candidate or it is obvious which candidate to choose by doing a +# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still +# accept a match between prototype and implementation in such cases. +# The default value is: NO. + +STRICT_PROTO_MATCHING = NO + +# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo +# list. This list is created by putting \todo commands in the documentation. +# The default value is: YES. + +GENERATE_TODOLIST = YES + +# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test +# list. This list is created by putting \test commands in the documentation. +# The default value is: YES. + +GENERATE_TESTLIST = YES + +# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug +# list. This list is created by putting \bug commands in the documentation. +# The default value is: YES. + +GENERATE_BUGLIST = YES + +# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) +# the deprecated list. This list is created by putting \deprecated commands in +# the documentation. +# The default value is: YES. + +GENERATE_DEPRECATEDLIST= YES + +# The ENABLED_SECTIONS tag can be used to enable conditional documentation +# sections, marked by \if ... \endif and \cond +# ... \endcond blocks. + +ENABLED_SECTIONS = + +# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the +# initial value of a variable or macro / define can have for it to appear in the +# documentation. If the initializer consists of more lines than specified here +# it will be hidden. Use a value of 0 to hide initializers completely. 
The +# appearance of the value of individual variables and macros / defines can be +# controlled using \showinitializer or \hideinitializer command in the +# documentation regardless of this setting. +# Minimum value: 0, maximum value: 10000, default value: 30. + +MAX_INITIALIZER_LINES = 30 + +# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at +# the bottom of the documentation of classes and structs. If set to YES, the +# list will mention the files that were used to generate the documentation. +# The default value is: YES. + +SHOW_USED_FILES = YES + +# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This +# will remove the Files entry from the Quick Index and from the Folder Tree View +# (if specified). +# The default value is: YES. + +SHOW_FILES = YES + +# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces +# page. This will remove the Namespaces entry from the Quick Index and from the +# Folder Tree View (if specified). +# The default value is: YES. + +SHOW_NAMESPACES = YES + +# The FILE_VERSION_FILTER tag can be used to specify a program or script that +# doxygen should invoke to get the current version for each file (typically from +# the version control system). Doxygen will invoke the program by executing (via +# popen()) the command <command> <input-file>, where <command> is the value of the +# FILE_VERSION_FILTER tag, and <input-file> is the name of an input file provided +# by doxygen. Whatever the program writes to standard output is used as the file +# version. For an example see the documentation. + +FILE_VERSION_FILTER = + +# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed +# by doxygen. The layout file controls the global structure of the generated +# output files in an output format independent way. To create the layout file +# that represents doxygen's defaults, run doxygen with the -l option. 
You can +# optionally specify a file name after the option, if omitted DoxygenLayout.xml +# will be used as the name of the layout file. +# +# Note that if you run doxygen from a directory containing a file called +# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE +# tag is left empty. + +LAYOUT_FILE = + +# The CITE_BIB_FILES tag can be used to specify one or more bib files containing +# the reference definitions. This must be a list of .bib files. The .bib +# extension is automatically appended if omitted. This requires the bibtex tool +# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info. +# For LaTeX the style of the bibliography can be controlled using +# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the +# search path. See also \cite for info how to create references. + +CITE_BIB_FILES = + +#--------------------------------------------------------------------------- +# Configuration options related to warning and progress messages +#--------------------------------------------------------------------------- + +# The QUIET tag can be used to turn on/off the messages that are generated to +# standard output by doxygen. If QUIET is set to YES this implies that the +# messages are off. +# The default value is: NO. + +QUIET = NO + +# The WARNINGS tag can be used to turn on/off the warning messages that are +# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES +# this implies that the warnings are on. +# +# Tip: Turn warnings on while writing the documentation. +# The default value is: YES. + +WARNINGS = YES + +# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate +# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag +# will automatically be disabled. +# The default value is: YES. 
+ +WARN_IF_UNDOCUMENTED = YES + +# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for +# potential errors in the documentation, such as not documenting some parameters +# in a documented function, or documenting parameters that don't exist or using +# markup commands wrongly. +# The default value is: YES. + +WARN_IF_DOC_ERROR = YES + +# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that +# are documented, but have no documentation for their parameters or return +# value. If set to NO, doxygen will only warn about wrong or incomplete +# parameter documentation, but not about the absence of documentation. If +# EXTRACT_ALL is set to YES then this flag will automatically be disabled. +# The default value is: NO. + +WARN_NO_PARAMDOC = YES + +# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when +# a warning is encountered. If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS +# then doxygen will continue running as if WARN_AS_ERROR tag is set to NO, but +# at the end of the doxygen process doxygen will return with a non-zero status. +# Possible values are: NO, YES and FAIL_ON_WARNINGS. +# The default value is: NO. + +WARN_AS_ERROR = NO + +# The WARN_FORMAT tag determines the format of the warning messages that doxygen +# can produce. The string should contain the $file, $line, and $text tags, which +# will be replaced by the file and line number from which the warning originated +# and the warning text. Optionally the format may contain $version, which will +# be replaced by the version of the file (if it could be obtained via +# FILE_VERSION_FILTER) +# The default value is: $file:$line: $text. + +WARN_FORMAT = "$file:$line: $text" + +# The WARN_LOGFILE tag can be used to specify a file to which warning and error +# messages should be written. If left blank the output is written to standard +# error (stderr). 
+ +WARN_LOGFILE = doxygen.errors + +#--------------------------------------------------------------------------- +# Configuration options related to the input files +#--------------------------------------------------------------------------- + +# The INPUT tag is used to specify the files and/or directories that contain +# documented source files. You may enter file names like myfile.cpp or +# directories like /usr/src/myproject. Separate the files or directories with +# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING +# Note: If this tag is empty the current directory is searched. + +INPUT = src/catch2 + +# This tag can be used to specify the character encoding of the source files +# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses +# libiconv (or the iconv built into libc) for the transcoding. See the libiconv +# documentation (see: +# https://www.gnu.org/software/libiconv/) for the list of possible encodings. +# The default value is: UTF-8. + +INPUT_ENCODING = UTF-8 + +# If the value of the INPUT tag contains directories, you can use the +# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and +# *.h) to filter out the source-files in the directories. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# read by doxygen. +# +# Note the list of default checked file patterns might differ from the list of +# default file extension mappings. +# +# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, +# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, +# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, +# *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C comment), +# *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f18, *.f, *.for, *.vhd, *.vhdl, +# *.ucf, *.qsf and *.ice. 
+ +FILE_PATTERNS = *.c \ + *.cc \ + *.cxx \ + *.cpp \ + *.c++ \ + *.java \ + *.ii \ + *.ixx \ + *.ipp \ + *.i++ \ + *.inl \ + *.idl \ + *.ddl \ + *.odl \ + *.h \ + *.hh \ + *.hxx \ + *.hpp \ + *.h++ \ + *.cs \ + *.d \ + *.php \ + *.php4 \ + *.php5 \ + *.phtml \ + *.inc \ + *.m \ + *.markdown \ + *.md \ + *.mm \ + *.dox \ + *.py \ + *.pyw \ + *.f90 \ + *.f95 \ + *.f03 \ + *.f08 \ + *.f18 \ + *.f \ + *.for \ + *.vhd \ + *.vhdl \ + *.ucf \ + *.qsf \ + *.ice + +# The RECURSIVE tag can be used to specify whether or not subdirectories should +# be searched for input files as well. +# The default value is: NO. + +RECURSIVE = YES + +# The EXCLUDE tag can be used to specify files and/or directories that should be +# excluded from the INPUT source files. This way you can easily exclude a +# subdirectory from a directory tree whose root is specified with the INPUT tag. +# +# Note that relative paths are relative to the directory from which doxygen is +# run. + +EXCLUDE = + +# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or +# directories that are symbolic links (a Unix file system feature) are excluded +# from the input. +# The default value is: NO. + +EXCLUDE_SYMLINKS = NO + +# If the value of the INPUT tag contains directories, you can use the +# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude +# certain files from those directories. +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories for example use the pattern */test/* + +EXCLUDE_PATTERNS = */lib/* + +# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names +# (namespaces, classes, functions, etc.) that should be excluded from the +# output. The symbol name can be a fully qualified name, a word, or if the +# wildcard * is used, a substring. 
Examples: ANamespace, AClass, +# AClass::ANamespace, ANamespace::*Test +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories use the pattern */test/* + +EXCLUDE_SYMBOLS = + +# The EXAMPLE_PATH tag can be used to specify one or more files or directories +# that contain example code fragments that are included (see the \include +# command). + +EXAMPLE_PATH = + +# If the value of the EXAMPLE_PATH tag contains directories, you can use the +# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and +# *.h) to filter out the source-files in the directories. If left blank all +# files are included. + +EXAMPLE_PATTERNS = + +# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be +# searched for input files to be used with the \include or \dontinclude commands +# irrespective of the value of the RECURSIVE tag. +# The default value is: NO. + +EXAMPLE_RECURSIVE = NO + +# The IMAGE_PATH tag can be used to specify one or more files or directories +# that contain images that are to be included in the documentation (see the +# \image command). + +IMAGE_PATH = + +# The INPUT_FILTER tag can be used to specify a program that doxygen should +# invoke to filter for each input file. Doxygen will invoke the filter program +# by executing (via popen()) the command: +# +# <filter> <input-file> +# +# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the +# name of an input file. Doxygen will then use the output that the filter +# program writes to standard output. If FILTER_PATTERNS is specified, this tag +# will be ignored. +# +# Note that the filter must not add or remove lines; it is applied before the +# code is scanned, but not when the output code is generated. If lines are added +# or removed, the anchors will not be placed correctly. 
+# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# properly processed by doxygen. + +INPUT_FILTER = + +# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern +# basis. Doxygen will compare the file name with each pattern and apply the +# filter if there is a match. The filters are a list of the form: pattern=filter +# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how +# filters are used. If the FILTER_PATTERNS tag is empty or if none of the +# patterns match the file name, INPUT_FILTER is applied. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# properly processed by doxygen. + +FILTER_PATTERNS = + +# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using +# INPUT_FILTER) will also be used to filter the input files that are used for +# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES). +# The default value is: NO. + +FILTER_SOURCE_FILES = NO + +# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file +# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and +# it is also possible to disable source filtering for a specific pattern using +# *.ext= (so without naming a filter). +# This tag requires that the tag FILTER_SOURCE_FILES is set to YES. + +FILTER_SOURCE_PATTERNS = + +# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that +# is part of the input, its contents will be placed on the main page +# (index.html). This can be useful if you have a project on for instance GitHub +# and want to reuse the introduction page also for the doxygen output. 
+ +USE_MDFILE_AS_MAINPAGE = + +#--------------------------------------------------------------------------- +# Configuration options related to source browsing +#--------------------------------------------------------------------------- + +# If the SOURCE_BROWSER tag is set to YES then a list of source files will be +# generated. Documented entities will be cross-referenced with these sources. +# +# Note: To get rid of all source code in the generated output, make sure that +# also VERBATIM_HEADERS is set to NO. +# The default value is: NO. + +SOURCE_BROWSER = NO + +# Setting the INLINE_SOURCES tag to YES will include the body of functions, +# classes and enums directly into the documentation. +# The default value is: NO. + +INLINE_SOURCES = NO + +# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any +# special comment blocks from generated source code fragments. Normal C, C++ and +# Fortran comments will always remain visible. +# The default value is: YES. + +STRIP_CODE_COMMENTS = NO + +# If the REFERENCED_BY_RELATION tag is set to YES then for each documented +# entity all documented functions referencing it will be listed. +# The default value is: NO. + +REFERENCED_BY_RELATION = NO + +# If the REFERENCES_RELATION tag is set to YES then for each documented function +# all documented entities called/used by that function will be listed. +# The default value is: NO. + +REFERENCES_RELATION = NO + +# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set +# to YES then the hyperlinks from functions in REFERENCES_RELATION and +# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will +# link to the documentation. +# The default value is: YES. 
+ +REFERENCES_LINK_SOURCE = NO + +# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the +# source code will show a tooltip with additional information such as prototype, +# brief description and links to the definition and documentation. Since this +# will make the HTML file larger and loading of large files a bit slower, you +# can opt to disable this feature. +# The default value is: YES. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +SOURCE_TOOLTIPS = YES + +# If the USE_HTAGS tag is set to YES then the references to source code will +# point to the HTML generated by the htags(1) tool instead of doxygen built-in +# source browser. The htags tool is part of GNU's global source tagging system +# (see https://www.gnu.org/software/global/global.html). You will need version +# 4.8.6 or higher. +# +# To use it do the following: +# - Install the latest version of global +# - Enable SOURCE_BROWSER and USE_HTAGS in the configuration file +# - Make sure the INPUT points to the root of the source tree +# - Run doxygen as normal +# +# Doxygen will invoke htags (and that will in turn invoke gtags), so these +# tools must be available from the command line (i.e. in the search path). +# +# The result: instead of the source browser generated by doxygen, the links to +# source code will now point to the output of htags. +# The default value is: NO. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +USE_HTAGS = NO + +# If the VERBATIM_HEADERS tag is set to YES then doxygen will generate a +# verbatim copy of the header file for each class for which an include is +# specified. Set to NO to disable this. +# See also: Section \class. +# The default value is: YES. + +VERBATIM_HEADERS = YES + +# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the +# clang parser (see: +# http://clang.llvm.org/) for more accurate parsing at the cost of reduced +# performance. 
This can be particularly helpful with template rich C++ code for +# which doxygen's built-in parser lacks the necessary type information. +# Note: The availability of this option depends on whether or not doxygen was +# generated with the -Duse_libclang=ON option for CMake. +# The default value is: NO. + +CLANG_ASSISTED_PARSING = NO + +# If clang assisted parsing is enabled and the CLANG_ADD_INC_PATHS tag is set to +# YES then doxygen will add the directory of each input to the include path. +# The default value is: YES. + +CLANG_ADD_INC_PATHS = YES + +# If clang assisted parsing is enabled you can provide the compiler with command +# line options that you would normally use when invoking the compiler. Note that +# the include paths will already be set by doxygen for the files and directories +# specified with INPUT and INCLUDE_PATH. +# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. + +CLANG_OPTIONS = + +# If clang assisted parsing is enabled you can provide the clang parser with the +# path to the directory containing a file called compile_commands.json. This +# file is the compilation database (see: +# http://clang.llvm.org/docs/HowToSetupToolingForLLVM.html) containing the +# options used when the source files were built. This is equivalent to +# specifying the -p option to a clang tool, such as clang-check. These options +# will then be passed to the parser. Any options specified with CLANG_OPTIONS +# will be added as well. +# Note: The availability of this option depends on whether or not doxygen was +# generated with the -Duse_libclang=ON option for CMake. + +CLANG_DATABASE_PATH = + +#--------------------------------------------------------------------------- +# Configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- + +# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all +# compounds will be generated. 
Enable this if the project contains a lot of +# classes, structs, unions or interfaces. +# The default value is: YES. + +ALPHABETICAL_INDEX = YES + +# In case all classes in a project start with a common prefix, all classes will +# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag +# can be used to specify a prefix (or a list of prefixes) that should be ignored +# while generating the index headers. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. + +IGNORE_PREFIX = + +#--------------------------------------------------------------------------- +# Configuration options related to the HTML output +#--------------------------------------------------------------------------- + +# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output +# The default value is: YES. + +GENERATE_HTML = YES + +# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a +# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of +# it. +# The default directory is: html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_OUTPUT = html + +# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each +# generated HTML page (for example: .htm, .php, .asp). +# The default value is: .html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FILE_EXTENSION = .html + +# The HTML_HEADER tag can be used to specify a user-defined HTML header file for +# each generated HTML page. If the tag is left blank doxygen will generate a +# standard header. +# +# To get valid HTML the header file that includes any scripts and style sheets +# that doxygen needs, which is dependent on the configuration options used (e.g. +# the setting GENERATE_TREEVIEW). It is highly recommended to start with a +# default header using +# doxygen -w html new_header.html new_footer.html new_stylesheet.css +# YourConfigFile +# and then modify the file new_header.html. 
See also section "Doxygen usage" +# for information on how to generate the default header that doxygen normally +# uses. +# Note: The header is subject to change so you typically have to regenerate the +# default header when upgrading to a newer version of doxygen. For a description +# of the possible markers and block names see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_HEADER = + +# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each +# generated HTML page. If the tag is left blank doxygen will generate a standard +# footer. See HTML_HEADER for more information on how to generate a default +# footer and what special commands can be used inside the footer. See also +# section "Doxygen usage" for information on how to generate the default footer +# that doxygen normally uses. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FOOTER = + +# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style +# sheet that is used by each HTML page. It can be used to fine-tune the look of +# the HTML output. If left blank doxygen will generate a default style sheet. +# See also section "Doxygen usage" for information on how to generate the style +# sheet that doxygen normally uses. +# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as +# it is more robust and this tag (HTML_STYLESHEET) will in the future become +# obsolete. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_STYLESHEET = + +# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined +# cascading style sheets that are included after the standard style sheets +# created by doxygen. Using this option one can overrule certain style aspects. +# This is preferred over using HTML_STYLESHEET since it does not replace the +# standard style sheet and is therefore more robust against future updates. 
+# Doxygen will copy the style sheet files to the output directory. +# Note: The order of the extra style sheet files is of importance (e.g. the last +# style sheet in the list overrules the setting of the previous ones in the +# list). For an example see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_STYLESHEET = + +# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or +# other source files which should be copied to the HTML output directory. Note +# that these files will be copied to the base HTML output directory. Use the +# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these +# files. In the HTML_STYLESHEET file, use the file name only. Also note that the +# files will be copied as-is; there are no commands or markers available. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_FILES = + +# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen +# will adjust the colors in the style sheet and background images according to +# this color. Hue is specified as an angle on a colorwheel, see +# https://en.wikipedia.org/wiki/Hue for more information. For instance the value +# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 +# purple, and 360 is red again. +# Minimum value: 0, maximum value: 359, default value: 220. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_HUE = 220 + +# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors +# in the HTML output. For a value of 0 the output will use grayscales only. A +# value of 255 will produce the most vivid colors. +# Minimum value: 0, maximum value: 255, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_SAT = 100 + +# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the +# luminance component of the colors in the HTML output. 
Values below 100 +# gradually make the output lighter, whereas values above 100 make the output +# darker. The value divided by 100 is the actual gamma applied, so 80 represents +# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not +# change the gamma. +# Minimum value: 40, maximum value: 240, default value: 80. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_GAMMA = 80 + +# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML +# page will contain the date and time when the page was generated. Setting this +# to YES can help to show when doxygen was last run and thus if the +# documentation is up to date. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_TIMESTAMP = NO + +# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML +# documentation will contain a main index with vertical navigation menus that +# are dynamically created via JavaScript. If disabled, the navigation index will +# consists of multiple levels of tabs that are statically embedded in every HTML +# page. Disable this option to support browsers that do not have JavaScript, +# like the Qt help browser. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_DYNAMIC_MENUS = YES + +# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML +# documentation will contain sections that can be hidden and shown after the +# page has loaded. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_DYNAMIC_SECTIONS = NO + +# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries +# shown in the various tree structured indices initially; the user can expand +# and collapse entries dynamically later on. 
Doxygen will expand the tree to +# such a level that at most the specified number of entries are visible (unless +# a fully collapsed tree already exceeds this amount). So setting the number of +# entries 1 will produce a full collapsed tree by default. 0 is a special value +# representing an infinite number of entries and will result in a full expanded +# tree by default. +# Minimum value: 0, maximum value: 9999, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_INDEX_NUM_ENTRIES = 100 + +# If the GENERATE_DOCSET tag is set to YES, additional index files will be +# generated that can be used as input for Apple's Xcode 3 integrated development +# environment (see: +# https://developer.apple.com/xcode/), introduced with OSX 10.5 (Leopard). To +# create a documentation set, doxygen will generate a Makefile in the HTML +# output directory. Running make will produce the docset in that directory and +# running make install will install the docset in +# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at +# startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy +# genXcode/_index.html for more information. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_DOCSET = NO + +# This tag determines the name of the docset feed. A documentation feed provides +# an umbrella under which multiple documentation sets from a single provider +# (such as a company or product suite) can be grouped. +# The default value is: Doxygen generated docs. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_FEEDNAME = "Doxygen generated docs" + +# This tag specifies a string that should uniquely identify the documentation +# set bundle. This should be a reverse domain-name style string, e.g. +# com.mycompany.MyDocSet. Doxygen will append .docset to the name. +# The default value is: org.doxygen.Project. 
+# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_BUNDLE_ID = org.doxygen.Project + +# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify +# the documentation publisher. This should be a reverse domain-name style +# string, e.g. com.mycompany.MyDocSet.documentation. +# The default value is: org.doxygen.Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_ID = org.doxygen.Publisher + +# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. +# The default value is: Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_NAME = Publisher + +# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three +# additional HTML index files: index.hhp, index.hhc, and index.hhk. The +# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop +# (see: +# https://www.microsoft.com/en-us/download/details.aspx?id=21138) on Windows. +# +# The HTML Help Workshop contains a compiler that can convert all HTML output +# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML +# files are now used as the Windows 98 help format, and will replace the old +# Windows help format (.hlp) on all Windows platforms in the future. Compressed +# HTML files also contain an index, a table of contents, and you can search for +# words in the documentation. The HTML workshop also contains a viewer for +# compressed HTML files. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_HTMLHELP = NO + +# The CHM_FILE tag can be used to specify the file name of the resulting .chm +# file. You can add a path in front of the file if the result should not be +# written to the html output directory. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. 
+ +CHM_FILE = + +# The HHC_LOCATION tag can be used to specify the location (absolute path +# including file name) of the HTML help compiler (hhc.exe). If non-empty, +# doxygen will try to run the HTML help compiler on the generated index.hhp. +# The file has to be specified with full path. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +HHC_LOCATION = + +# The GENERATE_CHI flag controls if a separate .chi index file is generated +# (YES) or that it should be included in the main .chm file (NO). +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +GENERATE_CHI = NO + +# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc) +# and project file content. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_INDEX_ENCODING = + +# The BINARY_TOC flag controls whether a binary table of contents is generated +# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it +# enables the Previous and Next buttons. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +BINARY_TOC = NO + +# The TOC_EXPAND flag can be set to YES to add extra items for group members to +# the table of contents of the HTML help documentation and to the tree view. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +TOC_EXPAND = NO + +# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and +# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that +# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help +# (.qch) of the generated HTML documentation. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_QHP = NO + +# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify +# the file name of the resulting .qch file. 
The path specified is relative to +# the HTML output folder. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QCH_FILE = + +# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help +# Project output. For more information please see Qt Help Project / Namespace +# (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace). +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_NAMESPACE = org.doxygen.Project + +# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt +# Help Project output. For more information please see Qt Help Project / Virtual +# Folders (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual-folders). +# The default value is: doc. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_VIRTUAL_FOLDER = doc + +# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom +# filter to add. For more information please see Qt Help Project / Custom +# Filters (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_NAME = + +# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the +# custom filter to add. For more information please see Qt Help Project / Custom +# Filters (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_ATTRS = + +# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this +# project's filter section matches. Qt Help Project / Filter Attributes (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#filter-attributes). +# This tag requires that the tag GENERATE_QHP is set to YES. 
+ +QHP_SECT_FILTER_ATTRS = + +# The QHG_LOCATION tag can be used to specify the location (absolute path +# including file name) of Qt's qhelpgenerator. If non-empty doxygen will try to +# run qhelpgenerator on the generated .qhp file. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHG_LOCATION = + +# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be +# generated, together with the HTML files, they form an Eclipse help plugin. To +# install this plugin and make it available under the help contents menu in +# Eclipse, the contents of the directory containing the HTML and XML files needs +# to be copied into the plugins directory of eclipse. The name of the directory +# within the plugins directory should be the same as the ECLIPSE_DOC_ID value. +# After copying Eclipse needs to be restarted before the help appears. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_ECLIPSEHELP = NO + +# A unique identifier for the Eclipse help plugin. When installing the plugin +# the directory name containing the HTML and XML files should also have this +# name. Each documentation set should have its own identifier. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. + +ECLIPSE_DOC_ID = org.doxygen.Project + +# If you want full control over the layout of the generated HTML pages it might +# be necessary to disable the index and replace it with your own. The +# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top +# of each HTML page. A value of NO enables the index and the value YES disables +# it. Since the tabs in the index contain the same information as the navigation +# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. 
+ +DISABLE_INDEX = NO + +# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index +# structure should be generated to display hierarchical information. If the tag +# value is set to YES, a side panel will be generated containing a tree-like +# index structure (just like the one that is generated for HTML Help). For this +# to work a browser that supports JavaScript, DHTML, CSS and frames is required +# (i.e. any modern browser). Windows users are probably better off using the +# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can +# further fine-tune the look of the index. As an example, the default style +# sheet generated by doxygen has an example that shows how to put an image at +# the root of the tree instead of the PROJECT_NAME. Since the tree basically has +# the same information as the tab index, you could consider setting +# DISABLE_INDEX to YES when enabling this option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_TREEVIEW = NO + +# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that +# doxygen will group on one line in the generated HTML documentation. +# +# Note that a value of 0 will completely suppress the enum values from appearing +# in the overview section. +# Minimum value: 0, maximum value: 20, default value: 4. +# This tag requires that the tag GENERATE_HTML is set to YES. + +ENUM_VALUES_PER_LINE = 4 + +# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used +# to set the initial width (in pixels) of the frame in which the tree is shown. +# Minimum value: 0, maximum value: 1500, default value: 250. +# This tag requires that the tag GENERATE_HTML is set to YES. + +TREEVIEW_WIDTH = 250 + +# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to +# external symbols imported via tag files in a separate window. +# The default value is: NO. 
+# This tag requires that the tag GENERATE_HTML is set to YES. + +EXT_LINKS_IN_WINDOW = NO + +# If the HTML_FORMULA_FORMAT option is set to svg, doxygen will use the pdf2svg +# tool (see https://github.com/dawbarton/pdf2svg) or inkscape (see +# https://inkscape.org) to generate formulas as SVG images instead of PNGs for +# the HTML output. These images will generally look nicer at scaled resolutions. +# Possible values are: png (the default) and svg (looks nicer but requires the +# pdf2svg or inkscape tool). +# The default value is: png. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FORMULA_FORMAT = png + +# Use this tag to change the font size of LaTeX formulas included as images in +# the HTML documentation. When you change the font size after a successful +# doxygen run you need to manually remove any form_*.png images from the HTML +# output directory to force them to be regenerated. +# Minimum value: 8, maximum value: 50, default value: 10. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FORMULA_FONTSIZE = 10 + +# Use the FORMULA_TRANSPARENT tag to determine whether or not the images +# generated for formulas are transparent PNGs. Transparent PNGs are not +# supported properly for IE 6.0, but are supported on all modern browsers. +# +# Note that when changing this option you need to delete any form_*.png files in +# the HTML output directory before the changes have effect. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FORMULA_TRANSPARENT = YES + +# The FORMULA_MACROFILE can contain LaTeX \newcommand and \renewcommand commands +# to create new LaTeX commands to be used in formulas as building blocks. See +# the section "Including formulas" for details. 
+ +FORMULA_MACROFILE = + +# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see +# https://www.mathjax.org) which uses client side JavaScript for the rendering +# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX +# installed or if you want the formulas to look prettier in the HTML output. When +# enabled you may also need to install MathJax separately and configure the path +# to it using the MATHJAX_RELPATH option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +USE_MATHJAX = YES + +# When MathJax is enabled you can set the default output format to be used for +# the MathJax output. See the MathJax site (see: +# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. +# Possible values are: HTML-CSS (which is slower, but has the best +# compatibility), NativeMML (i.e. MathML) and SVG. +# The default value is: HTML-CSS. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_FORMAT = HTML-CSS + +# When MathJax is enabled you need to specify the location relative to the HTML +# output directory using the MATHJAX_RELPATH option. The destination directory +# should contain the MathJax.js script. For instance, if the mathjax directory +# is located at the same level as the HTML output directory, then +# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax +# Content Delivery Network so you can quickly see the result without installing +# MathJax. However, it is strongly recommended to install a local copy of +# MathJax from https://www.mathjax.org before deployment. +# The default value is: https://cdn.jsdelivr.net/npm/mathjax@2. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_RELPATH = https://cdn.jsdelivr.net/npm/mathjax@2 + +# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax +# extension names that should be enabled during MathJax rendering. 
For example +# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_EXTENSIONS = TeX/AMSmath \ + TeX/AMSsymbols + +# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces +# of code that will be used on startup of the MathJax code. See the MathJax site +# (see: +# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. For an +# example see the documentation. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_CODEFILE = + +# When the SEARCHENGINE tag is enabled doxygen will generate a search box for +# the HTML output. The underlying search engine uses javascript and DHTML and +# should work on any modern browser. Note that when using HTML help +# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET) +# there is already a search function so this one should typically be disabled. +# For large projects the javascript based search engine can be slow, then +# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to +# search using the keyboard; to jump to the search box use + S +# (what the is depends on the OS and browser, but it is typically +# , /